pax_global_header 0000666 0000000 0000000 00000000064 14634250137 0014517 g ustar 00root root 0000000 0000000 52 comment=910521c9475298a9a78d434d0bba8ee9397dacf6
cython-blis-1.0.0/ 0000775 0000000 0000000 00000000000 14634250137 0013750 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/.appveyor.yml 0000664 0000000 0000000 00000001712 14634250137 0016417 0 ustar 00root root 0000000 0000000 environment:
matrix:
- BLIS_ARCH: "generic"
- BLIS_ARCH: "x86_64"
install:
- git submodule update --init --recursive
- cd flame-blis
- set "CC=clang"
- set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%"
- set "PATH=C:\Program Files\LLVM\bin;%PATH%"
- set "AR=llvm-ar"
- set "AS=llvm-as"
- call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
build_script:
- set RANLIB=echo
- set LIBPTHREAD=
- set "PATH=%PATH%;C:\blis\lib"
- set "CFLAGS=-Wno-macro-redefined"
- cd %APPVEYOR_BUILD_FOLDER%
- cd flame-blis
- bash -lc "ln -s $APPVEYOR_BUILD_FOLDER /c/projects/cython-blis"
- bash -lc "cd /c/projects/cython-blis/ && ./bin/generate-make-jsonl windows $BLIS_ARCH --export"
artifacts:
- path: blis/_src/make
name: windows-generic.jsonl
- path: blis/_src/make
name: windows-x86_64.jsonl
- path: blis/_src/include/windows-generic
name: blis.h
- path: blis/_src/include/windows-x86_64
name: blis.h
cython-blis-1.0.0/.github/ 0000775 0000000 0000000 00000000000 14634250137 0015310 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/.github/workflows/ 0000775 0000000 0000000 00000000000 14634250137 0017345 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/.github/workflows/tests.yml 0000664 0000000 0000000 00000006547 14634250137 0021246 0 ustar 00root root 0000000 0000000 name: tests
on:
push:
paths-ignore:
- "*.md"
pull_request:
types: [opened, synchronize, reopened, edited]
paths-ignore:
- "*.md"
env:
MODULE_NAME: 'blis'
RUN_MYPY: 'false'
jobs:
tests:
name: Test
if: github.repository_owner == 'explosion'
strategy:
fail-fast: true
matrix:
os: [ubuntu-latest, windows-latest, macos-13]
python_version: ["3.9", "3.10", "3.11", "3.12"]
runs-on: ${{ matrix.os }}
steps:
- name: Check out repo
uses: actions/checkout@v3
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Preinstall (Windows)
shell: bash
if: startsWith(matrix.os, 'windows')
run: |
choco install llvm
- name: Install dependencies
run: |
python -m pip install -U build pip setuptools wheel
- name: Build wheel (Windows)
  if: startsWith(matrix.os, 'windows')
  # The commands below are cmd.exe syntax (`set "VAR=value"`, `%PATH%`).
  # Windows runners default to PowerShell, where these lines would not
  # export PATH/AR/AS/CC/RANLIB to the build, so pin the shell explicitly.
  shell: cmd
  run: |
    set "PATH=C:\Program Files\LLVM\bin;%PATH%"
    set "AR=llvm-ar"
    set "AS=llvm-as"
    set "CC=clang"
    set RANLIB=echo
    clang --version
    python -m build --wheel
- name: Build wheel (Mac)
if: startsWith(matrix.os, 'macos')
run: |
python -m build --wheel
- name: Build wheel (Linux / clang)
if: startsWith(matrix.os, 'ubuntu') && matrix.python_version == '3.6'
run: |
clang --version
CC=clang python -m build --wheel
- name: Build wheel (Linux / gcc-9)
if: startsWith(matrix.os, 'ubuntu') && (matrix.python_version == '3.7' || matrix.python_version == '3.8')
run: |
gcc-9 --version
CC=gcc-9 python -m build --wheel
- name: Build wheel (Linux / gcc-10)
if: startsWith(matrix.os, 'ubuntu') && matrix.python_version == '3.9'
run: |
gcc-10 --version
CC=gcc-10 python -m build --wheel
- name: Build wheel (Linux / clang-13)
if: startsWith(matrix.os, 'ubuntu') && matrix.python_version == '3.10'
run: |
clang-13 --version
CC=clang-13 python -m build --wheel
- name: Build wheel (Linux / gcc)
if: startsWith(matrix.os, 'ubuntu') && (matrix.python_version == '3.11' || matrix.python_version == '3.12')
run: |
gcc --version
CC=gcc python -m build --wheel
# TODO: install mypy from requirements if reenabled
- name: Run mypy
shell: bash
if: ${{ env.RUN_MYPY == 'true' }}
run: |
python -m mypy $MODULE_NAME
- name: Uninstall all packages
run: |
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
- name: Install wheel
shell: bash
run: |
python -m pip install dist/*.whl
- name: Delete source directory
shell: bash
run: |
rm -rf $MODULE_NAME
- name: Test import
  shell: bash
  # Interpreter options must precede `-c`: anything after the command string
  # is handed to the program as sys.argv, so a trailing `-Werror` would never
  # turn warnings into errors.
  run: |
    python -W error -c "import $MODULE_NAME"
- name: Install test requirements
run: |
python -m pip install -U -r requirements.txt
- name: Run tests
shell: bash
run: |
python -m pytest --pyargs $MODULE_NAME -Werror
cython-blis-1.0.0/.gitignore 0000664 0000000 0000000 00000000333 14634250137 0015737 0 ustar 00root root 0000000 0000000 .*.sw*
blis/blis.c
*.pyc
*.so
blis/cy.c
blis/py.c
.eggs
.env/
env3.6
.hypothesis/
build/
cache/
__pycache__/
.python-version
cythonize.dat
dist/
.pytest_cache
blis.egg-info/
tmp/
# Blis stuff
blis/include
.fragment.mk
cython-blis-1.0.0/.gitmodules 0000664 0000000 0000000 00000000125 14634250137 0016123 0 ustar 00root root 0000000 0000000 [submodule "flame-blis"]
path = flame-blis
url = https://github.com/explosion/blis
cython-blis-1.0.0/.travis.yml 0000775 0000000 0000000 00000004543 14634250137 0016072 0 ustar 00root root 0000000 0000000 sudo: required
dist: focal
env:
global:
- PLAT=x86_64
- UNICODE_WIDTH=32
- CC=gcc-9
matrix:
include:
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="3.7"
- BLIS_ARCH="generic"
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="3.7"
- BLIS_ARCH="x86_64"
- os: linux
python: '3.7'
env:
- BLIS_ARCH="generic"
- os: linux
language: python
python: '3.7'
env:
- BLIS_ARCH="x86_64"
- os: linux
language: python
arch: arm64
python: '3.7'
env:
- BLIS_ARCH="cortexa57"
- os: linux
language: shell
arch: ppc64le
dist: focal
env:
- BLIS_ARCH="power9"
before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then source ./bin/travis/before_install_osx.sh;
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then source ./bin/travis/before_install_linux.sh;
fi
- before_install
install:
- python -m pip install -r requirements.txt
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./bin/generate-make-jsonl darwin $BLIS_ARCH --export;
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./bin/generate-make-jsonl linux $BLIS_ARCH --export;
fi
script:
- python setup.py build_ext --inplace
- PYTHONPATH=`pwd` python -m pytest blis/
notifications:
email: false
slack:
secure: VSqtxg7u4NTZRfoZqjxPRPVS92KTy/mp62egfDZ9ujTP4VPxNe15QZuTB6r/ICPgEYqBtdhLc/aetuBcemt0bHfentV0F7bz7iDY/AFQC1h1i4G0D0wKMufuqOJFw9MOp2tSpuvCVzhCxR+Ymx/F9SaeYBAiwBawce4wu+qu3lA=
deploy:
skip_cleanup: true
provider: gcs
access_key_id: GOOGAYJSXD24MLFQGHMJ6TQC
secret_access_key:
secure: 8SbYhu799pawZfC0a/Jq7eQklvfRNn1hJRnuEEpRdBO6fnFNMeYtTaSb867dwNl00i4VuQAjfcE8RXleY3EeP18qtmqfknCnOLCrSHphqWCYo/nx2wx/zC0E1xC4pefB2sO9nHEuKQVsi5OziNXunWedTh7n6CANoLRJmiypflvlLcOYp5eCLUcsoDbOtb7m2DDYXiCe8NM3ymZ2k42GmXqV2pvx14b0kl6okmAZJ3IMqfRLMXow5TxXwZx/AwW/N3FpitbhOAM2t10MWEdP4egkZlS+b2QKKnwvkocXAXstjokLsYBei8/9/AA2+ldtzT4HiBv6osPy4Y3MB68uyy3x+Q/4PZv7plxP2UPspyUCUCeYeY8CU3S8+8EjQhZYRphx2CibBLCOOpC68GxDcxMjXAgdm0FW1MLpbp/1NJRHgPQrpFvnKjjt01ysha50UGppigX6ebvH5fz4IIhMTRzMEhchCZR4GZvfHx0RaVJz21M5ngLGBQaV7pp99wCy8g/vtztOzwIKVP0VuCl4n31/Cit8QzNIQOQ0YoHJO1alr3SGyXmwnxx0r4DtRsPB70cGq45d7TuMVi7qTe7/gvHCG5rwC1X5YNTYiUae92j9niMLBMeuD5bToAnJIMYIwllTgyDuyo+u1a+fN5jJTtWgK/dBhrXvPaBZovupsmU=
bucket: cython-blis-artifacts
local-dir: artifacts
on:
repo: explosion/cython-blis
branch: master
cython-blis-1.0.0/.vscode/ 0000775 0000000 0000000 00000000000 14634250137 0015311 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/.vscode/settings.json 0000664 0000000 0000000 00000000057 14634250137 0020046 0 ustar 00root root 0000000 0000000 {
"python.pythonPath": ".env/bin/python3"
} cython-blis-1.0.0/LICENSE 0000664 0000000 0000000 00000003770 14634250137 0014764 0 ustar 00root root 0000000 0000000 NOTE: Portions of this project's code are copyrighted by
The University of Texas at Austin
while other portions are copyrighted by
Hewlett Packard Enterprise Development LP
Advanced Micro Devices, Inc.
ExplosionAI GmbH
with some overlap. Please see file-level license headers for file-specific
copyright info. All parties provide their portions of the code under the
3-clause BSD license, found below.
---
Copyright (C) 2018, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cython-blis-1.0.0/MANIFEST.in 0000664 0000000 0000000 00000000252 14634250137 0015505 0 ustar 00root root 0000000 0000000 include LICENSE
include README.md
exclude blis/cy.c
exclude blis/py.c
recursive-include blis/_src *.c
recursive-include blis/_src *.h
recursive-include blis/_src *.jsonl
cython-blis-1.0.0/README.md 0000664 0000000 0000000 00000015002 14634250137 0015225 0 ustar 00root root 0000000 0000000
# Cython BLIS: Fast BLAS-like operations from Python and Cython, without the tears
This repository provides the
[Blis linear algebra](https://github.com/flame/blis) routines as a
self-contained Python C-extension.
Currently, we only support single-threaded execution, as this is actually best
for our workloads (ML inference).
[Tests](https://github.com/explosion/cython-blis/actions/workflows/tests.yml)
[PyPI](https://pypi.python.org/pypi/blis)
[conda](https://anaconda.org/conda-forge/cython-blis)
[Python wheels](https://github.com/explosion/wheelwright/releases)
## Installation
You can install the package via pip, first making sure that `pip`, `setuptools`,
and `wheel` are up-to-date:
```bash
pip install -U pip setuptools wheel
pip install blis
```
Wheels should be available, so installation should be fast. If you want to
install from source and you're on Windows, you'll need to install LLVM.
### Building BLIS for alternative architectures
The provided wheels should work on x86_64 and osx/arm64 architectures.
Unfortunately we do not currently know a way to provide different wheels for
alternative architectures, and we cannot provide a single binary that works
everywhere. So if the wheel doesn't work for your CPU, you'll need to install
from the source distribution and tell Blis your CPU architecture using the
`BLIS_ARCH` environment variable.
#### a) Install with auto-detected CPU support
```bash
pip install spacy --no-binary blis
```
#### b) Install using an existing configuration
Provide an architecture from the
[supported configurations](https://github.com/explosion/cython-blis/tree/v0.9.0/blis/_src/make).
```bash
BLIS_ARCH="power9" pip install spacy --no-binary blis
```
#### c) Install with generic arch support
> ⚠️ `generic` is not optimized for any particular CPU and is extremely slow.
> Only recommended for testing!
```bash
BLIS_ARCH="generic" pip install spacy --no-binary blis
```
#### d) Build specific support
In order to compile Blis, `cython-blis` bundles makefile scripts for specific
architectures, that are compiled by running the Blis build system and logging
the commands. We do not yet have logs for every architecture, as there are some
architectures we have not had access to.
[See here](https://github.com/flame/blis/blob/0.9.0/config_registry) for the list of
architectures. For example, here's how to build support for the Intel
architecture `knl`:
```bash
git clone https://github.com/explosion/cython-blis && cd cython-blis
git pull && git submodule init && git submodule update && git submodule status
python3 -m venv venv
source venv/bin/activate
pip install -U pip setuptools wheel
pip install -r requirements.txt
./bin/generate-make-jsonl linux knl
BLIS_ARCH="knl" python setup.py build_ext --inplace
BLIS_ARCH="knl" python setup.py bdist_wheel
```
Fingers crossed, this will build you a wheel that supports your platform. You
could then [submit a PR](https://github.com/explosion/cython-blis/pulls) with
the `blis/_src/make/linux-knl.jsonl` and `blis/_src/include/linux-knl/blis.h`
files so that you can run:
```bash
BLIS_ARCH="knl" pip install blis --no-binary=blis
```
## Usage
Two APIs are provided: a high-level Python API, and direct
[Cython](http://cython.org) access, which provides fused-type, nogil Cython
bindings to the underlying Blis linear algebra library. Fused types are a simple
template mechanism, allowing just a touch of compile-time generic programming:
```python
cimport blis.cy
A = calloc(nN * nI, sizeof(float))
B = calloc(nO * nI, sizeof(float))
C = calloc(nr_b0 * nr_b1, sizeof(float))
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE,
nO, nI, nN,
1.0, A, nI, 1, B, nO, 1,
1.0, C, nO, 1)
```
Bindings have been added as we've needed them. Please submit pull requests if
the library is missing some functions you require.
## Development
To build the source package, you should run the following command:
```bash
./bin/update-vendored-source
```
This populates the `blis/_src` folder for the various architectures, using the
`flame-blis` submodule.
## Updating the build files
In order to compile the Blis sources, we use jsonl files that provide the
explicit compiler flags. We build these jsonl files by running Blis's build
system, and then converting the log. This avoids us having to replicate the
build system within Python: we just use the jsonl to make a bunch of subprocess
calls. To support a new OS/architecture combination, we have to provide the
jsonl file and the header.
### Linux
The Linux build files need to be produced from within the manylinux2014 Docker
container, so that they will be compatible with the wheel building process.
First, install docker. Then do the following to start the container:
sudo docker run -it quay.io/pypa/manylinux2014_x86_64:latest
Once within the container, the following commands should check out the repo and
build the jsonl files for the generic arch:
mkdir /usr/local/repos
cd /usr/local/repos
git clone https://github.com/explosion/cython-blis && cd cython-blis
git pull && git submodule init && git submodule update && git submodule
status
/opt/python/cp36-cp36m/bin/python -m venv env3.6
source env3.6/bin/activate
pip install -r requirements.txt
./bin/generate-make-jsonl linux generic --export
BLIS_ARCH=generic python setup.py build_ext --inplace
# N.B.: don't copy to /tmp, docker cp doesn't work from there.
cp blis/_src/include/linux-generic/blis.h /linux-generic-blis.h
cp blis/_src/make/linux-generic.jsonl /
Then from a new terminal, retrieve the two files we need out of the container:
sudo docker ps -l # Get the container ID
# When I'm in Vagrant, I need to go via cat -- but then I end up with dummy
# lines at the top and bottom. Sigh. If you don't have that problem and
# sudo docker cp just works, just copy the file.
sudo docker cp aa9d42588791:/linux-generic-blis.h - | cat > linux-generic-blis.h
sudo docker cp aa9d42588791:/linux-generic.jsonl - | cat > linux-generic.jsonl
cython-blis-1.0.0/azure-pipelines.yml 0000664 0000000 0000000 00000004050 14634250137 0017606 0 ustar 00root root 0000000 0000000 trigger:
batch: true
branches:
include:
- '*'
jobs:
- job: 'JSONL'
# Manually enable for generating JSONL
condition: false
strategy:
matrix:
Python38Mac:
imageName: 'macos-latest'
python.version: '3.8'
Python38Windows:
imageName: 'windows-latest'
python.version: '3.8'
maxParallel: 4
pool:
vmImage: $(imageName)
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
architecture: 'x64'
- script: choco install llvm
condition: eq( variables['Agent.OS'], 'Windows_NT')
displayName: 'Preinstall (Windows)'
- script: git config --global core.autocrlf false
displayName: 'Disable automatic crlf conversion'
- script: git submodule update --init --recursive
displayName: 'Update git modules'
- script: |
python -m pip install --upgrade pip wheel setuptools
pip install -r requirements.txt
displayName: 'Install dependencies'
# Set the correct paths and includes. Only the env variables set here are
# used, not the ones defined in the .jsonl.
- script: |
set "PATH=C:\Program Files\LLVM\bin;%PATH%"
set "AR=llvm-ar"
set "AS=llvm-as"
set "CC=clang"
set RANLIB=echo
call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
clang --version
bash -lc "./bin/generate-make-jsonl windows generic --export"
bash -lc "./bin/generate-make-jsonl windows x86_64 --export"
condition: eq( variables['Agent.OS'], 'Windows_NT')
displayName: 'Generate JSONL (Windows)'
- script: |
bin/generate-make-jsonl darwin generic --export
bin/generate-make-jsonl darwin x86_64 --export
bin/generate-make-jsonl darwin x86_64_no_zen3 --export
bin/generate-make-jsonl darwin x86_64_no_zen2 --export
bin/generate-make-jsonl darwin x86_64_no_skx --export
condition: eq(variables['Agent.OS'], 'Darwin')
displayName: 'Generate JSONL (Mac)'
- publish: $(System.DefaultWorkingDirectory)/artifacts
artifact: '$(Agent.JobName)'
cython-blis-1.0.0/bin/ 0000775 0000000 0000000 00000000000 14634250137 0014520 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/bin/.appveyor_compile_jsonl.yml 0000664 0000000 0000000 00000004041 14634250137 0022102 0 ustar 00root root 0000000 0000000 #environment:
#
# matrix:
# - PYTHON: "/c/Python35-x64"
# - PYTHON: "/c/Python36-x64"
# - PYTHON: "/c/Python37-x64"
install:
- git submodule update --init --recursive
- cd flame-blis
- set "CC=clang"
- set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%"
- set "PATH=C:\Program Files\LLVM\bin;%PATH%"
- set "AR=llvm-ar"
- set "AS=llvm-as"
- call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
build_script:
- set RANLIB=echo
- set LIBPTHREAD=
- set "PATH=%PATH%;C:\blis\lib"
- set "CFLAGS=-Wno-macro-redefined"
- cd %APPVEYOR_BUILD_FOLDER%
- cd flame-blis
- bash -lc "ln -s $APPVEYOR_BUILD_FOLDER /c/projects/cython-blis"
- bash -lc "cd /c/projects/cython-blis/flame-blis && ./configure --disable-shared --disable-cblas --disable-blas --disable-threading --enable-verbose-make --enable-arg-max-hack --prefix=/c/blis x86_64"
- bash -lc "cd /c/projects/cython-blis/flame-blis && mingw32-make -j4 > make.log"
- bash -lc "mkdir -p /c/projects/cython-blis/blis/_src/include/windows-x86_64"
- bash -lc "cd /c/projects/cython-blis && cat flame-blis/make.log | python bin/munge_make_log.py windows x86_64 > blis/_src/make/windows-x86_64.jsonl"
- bash -lc "cp /c/projects/cython-blis/flame-blis/include/x86_64/blis.h /c/projects/cython-blis/blis/_src/include/windows-x86_64/blis.h"
- bash -lc "cp /c/projects/cython-blis/blis/_src/make/windows-x86_64.jsonl $APPVEYOR_BUILD_FOLDER/blis/_src/make/windows-x86_64.jsonl"
- bash -lc "mkdir -p $APPVEYOR_BUILD_FOLDER/blis/_src/include/windows-x86_64"
- bash -lc "cp /c/projects/cython-blis/blis/_src/include/windows-x86_64/blis.h $APPVEYOR_BUILD_FOLDER/blis/_src/include/windows-x86_64/blis.h"
#- python -m pip install -U pip wheel
#- python -m pip install -r requirements.txt
#- python setup.py bdist_wheel
#- cd ..
#- bash -lc "cp -r $APPVEYOR_BUILD_FOLDER /c/build"
#- bash -lc "python -m pip install /c/build/dist/*.whl"
#test_script:
#- python -m pytest --pyargs blis
artifacts:
- path: blis/_src/make
name: windows-x86_64.jsonl
- path: blis/_src/include/windows-x86_64
name: blis.h
cython-blis-1.0.0/bin/.appveyor_run_tests.yml 0000664 0000000 0000000 00000000756 14634250137 0021304 0 ustar 00root root 0000000 0000000 environment:
matrix:
- PYTHON: "C:\\Python35-x64"
- PYTHON: "C:\\Python36-x64"
- PYTHON: "C:\\Python37-x64"
install:
- set "PATH=%PYTHON%;%path%"
build_script:
- python -m pip install -U pip wheel
- python -m pip install -r requirements.txt
- python setup.py bdist_wheel
- cd ..
- bash -lc "cp -r $APPVEYOR_BUILD_FOLDER /c/build"
- bash -lc "python -m pip install /c/build/dist/*.whl"
test_script:
- python -m pytest --pyargs blis
artifacts:
- path: dist/
name: wheels
cython-blis-1.0.0/bin/.update_blis_travis.yml 0000775 0000000 0000000 00000004325 14634250137 0021213 0 ustar 00root root 0000000 0000000 language: python
sudo: required
dist: xenial
env:
global:
- PLAT=x86_64
- UNICODE_WIDTH=32
matrix:
include:
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="2.7"
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="3.5"
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="3.6"
- os: osx
language: generic
env:
- MB_PYTHON_VERSION="3.7"
- os: linux
python: '2.7'
- os: linux
python: '3.5'
- os: linux
python: '3.6'
- os: linux
python: '3.7'
before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then source ./travis/before_install_osx.sh;
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then source ./travis/before_install_linux.sh;
fi
- before_install
install:
- python -m pip install -r requirements.txt
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./bin/generate-make-jsonl darwin x86_64 --export;
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./bin/generate-make-jsonl linux x86_64 --export;
fi
- python setup.py bdist_wheel
- rm -rf blis*
- python -m pip install dist/*.whl
script:
- python -m pytest tests/
notifications:
email: false
slack:
secure: VSqtxg7u4NTZRfoZqjxPRPVS92KTy/mp62egfDZ9ujTP4VPxNe15QZuTB6r/ICPgEYqBtdhLc/aetuBcemt0bHfentV0F7bz7iDY/AFQC1h1i4G0D0wKMufuqOJFw9MOp2tSpuvCVzhCxR+Ymx/F9SaeYBAiwBawce4wu+qu3lA=
deploy:
skip_cleanup: true
provider: gcs
access_key_id: GOOGAYJSXD24MLFQGHMJ6TQC
secret_access_key:
secure: 8SbYhu799pawZfC0a/Jq7eQklvfRNn1hJRnuEEpRdBO6fnFNMeYtTaSb867dwNl00i4VuQAjfcE8RXleY3EeP18qtmqfknCnOLCrSHphqWCYo/nx2wx/zC0E1xC4pefB2sO9nHEuKQVsi5OziNXunWedTh7n6CANoLRJmiypflvlLcOYp5eCLUcsoDbOtb7m2DDYXiCe8NM3ymZ2k42GmXqV2pvx14b0kl6okmAZJ3IMqfRLMXow5TxXwZx/AwW/N3FpitbhOAM2t10MWEdP4egkZlS+b2QKKnwvkocXAXstjokLsYBei8/9/AA2+ldtzT4HiBv6osPy4Y3MB68uyy3x+Q/4PZv7plxP2UPspyUCUCeYeY8CU3S8+8EjQhZYRphx2CibBLCOOpC68GxDcxMjXAgdm0FW1MLpbp/1NJRHgPQrpFvnKjjt01ysha50UGppigX6ebvH5fz4IIhMTRzMEhchCZR4GZvfHx0RaVJz21M5ngLGBQaV7pp99wCy8g/vtztOzwIKVP0VuCl4n31/Cit8QzNIQOQ0YoHJO1alr3SGyXmwnxx0r4DtRsPB70cGq45d7TuMVi7qTe7/gvHCG5rwC1X5YNTYiUae92j9niMLBMeuD5bToAnJIMYIwllTgyDuyo+u1a+fN5jJTtWgK/dBhrXvPaBZovupsmU=
bucket: cython-blis-artifacts
local-dir: artifacts
on:
repo: explosion/cython-blis
branch: update-blis
cython-blis-1.0.0/bin/generate-make-jsonl 0000775 0000000 0000000 00000002031 14634250137 0020272 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
set -e

# Build BLIS for one OS/architecture pair with verbose make output, convert
# the make log into blis/_src/make/<OS>-<ARCH>.jsonl and copy the generated
# blis.h next to it.
#
# Usage: bin/generate-make-jsonl OS ARCH [--export]
#   OS        label used in output file names; "windows" selects mingw32-make
#   ARCH      BLIS configuration name handed to ./configure
#   --export  additionally copy the header and the JSONL into artifacts/
#
# Must be started from the repository root: it changes into flame-blis/ and
# addresses every output through "../".
OS="$1"
ARCH="$2"
EXPORT="$3"

if [ -z "$OS" ] || [ -z "$ARCH" ]; then
    echo "Usage: $0 OS ARCH [--export]" 1>&2
    exit 1
fi

JSONL="blis/_src/make/$OS-$ARCH.jsonl"

cd flame-blis
# NOTE(review): $JSONL is relative to the repository root, but this test is
# evaluated inside flame-blis/, while the result is written to ../$JSONL.
# As written, the check does not look at the file produced below. Confirm
# whether "../$JSONL" was intended before changing it: doing so would skip
# regeneration for every JSONL that is already present.
if [ ! -f "$JSONL" ]; then
    echo "Compile"
    if [[ "$OS" == "windows" ]]; then
        mingw32-make clean
        ./configure --disable-blas --disable-cblas --disable-shared --disable-threading --int-size=64 --enable-verbose-make --enable-arg-max-hack "$ARCH"
        mingw32-make -j 4 > make.log
    else
        make clean
        ./configure --disable-blas --disable-cblas --disable-shared --disable-threading --int-size=64 --enable-verbose-make --export-shared=all "$ARCH"
        make > make.log
    fi
    echo "Preprocess make log"
    python ../bin/munge_make_log.py "$OS" "$ARCH" < make.log > "../$JSONL"
    mkdir -p "../blis/_src/include/$OS-$ARCH/"
    cp "include/$ARCH/blis.h" "../blis/_src/include/$OS-$ARCH/blis.h"
fi

if [[ "$EXPORT" == "--export" ]]; then
    mkdir -p ../artifacts/
    cp "../blis/_src/include/$OS-$ARCH/blis.h" "../artifacts/blis-$OS-$ARCH.h"
    cp "../blis/_src/make/$OS-$ARCH.jsonl" "../artifacts/$OS-$ARCH.jsonl"
fi
cython-blis-1.0.0/bin/munge_make_log.py 0000664 0000000 0000000 00000002147 14634250137 0020047 0 ustar 00root root 0000000 0000000 import os
import sys
import json
def parse_compile_line(line, os_name, arch_name):
    """Turn one line of verbose make output into a compile-command record.

    Occurrences of ``include/<arch_name>`` in the line are rewritten to
    ``include/<os_name>-<arch_name>`` before the line is split on whitespace.

    Returns a dict with the keys ``compiler``, ``source``, ``flags``,
    ``macros`` and ``include`` (plus ``target`` when ``-o`` is present), or
    ``None`` when the line contains no ``-c <source>`` pair.
    """
    line = line.replace('include/' + arch_name, 'include/' + os_name + '-' + arch_name)
    pieces = line.split()
    args = {}
    flags = []
    macros = []
    includes = []
    for i, piece in enumerate(pieces):
        if i == 0:
            args['compiler'] = piece
        elif piece in ('-c', '-o'):
            # A dangling -c/-o at the end of a line has no operand; ignore it
            # instead of raising IndexError and aborting the whole conversion.
            if i + 1 < len(pieces):
                args['source' if piece == '-c' else 'target'] = pieces[i + 1]
        elif piece.startswith(('-f', '-m', '-O', '-std')):
            flags.append(piece)
        elif piece.startswith('-D'):
            # Drop the shell escaping backslashes from macro definitions.
            macros.append(piece.replace('\\', ''))
        elif piece.startswith('-I'):
            includes.append(piece)
    if 'source' not in args:
        return None
    args['flags'] = flags
    args['macros'] = macros
    args['include'] = includes
    return args


def main(argv=None, stdin=None, stdout=None):
    """Convert a make log read from ``stdin`` into JSON lines on ``stdout``.

    ``argv`` must hold the program name followed by the OS name and the
    architecture name. The first output line records the process environment;
    each following line is one compile command. Returns the process exit
    status: 0 on success, 2 when the arguments are missing.
    """
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    if len(argv) < 3:
        sys.stderr.write("usage: munge_make_log.py OS ARCH < make.log\n")
        return 2
    os_name = argv[1]
    arch_name = argv[2]
    # NOTE(review): the complete process environment is written to the output.
    # On CI this could include secret variables - confirm that is acceptable
    # for files that get committed or published as artifacts.
    stdout.write(json.dumps({"environment": dict(os.environ)}) + "\n")
    for line in stdin:
        # Header flattening is a build helper step, not a compile command.
        if 'flatten-headers.py' in line:
            continue
        record = parse_compile_line(line, os_name, arch_name)
        if record is not None:
            stdout.write(json.dumps(record) + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
cython-blis-1.0.0/bin/push-tag.sh 0000775 0000000 0000000 00000000562 14634250137 0016612 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
set -e

# Usage: bin/push-tag.sh BRANCH
# Syncs BRANCH with origin, then creates and pushes the tag "v<version>",
# where <version> is read from the __version__ line in blis/about.py.
if [ -z "$1" ]; then
    echo "Usage: $0 BRANCH" 1>&2
    exit 1
fi

# Insist repository is clean
git diff-index --quiet HEAD

git checkout "$1"
git pull origin "$1"
git push origin "$1"

# Reduce the `__version__ = "x.y.z"` line to the bare version number by
# removing the assignment prefix and up to two quotes of either kind.
version=$(grep "__version__ = " blis/about.py)
version=${version/__version__ = }
version=${version/\'/}
version=${version/\'/}
version=${version/\"/}
version=${version/\"/}
git tag "v$version"
git push origin "v$version" --tags
cython-blis-1.0.0/bin/travis/ 0000775 0000000 0000000 00000000000 14634250137 0016030 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/bin/travis/before_install_linux.sh 0000775 0000000 0000000 00000000154 14634250137 0022576 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
set -e
function before_install {
    # Install the Python development headers needed to compile the extension.
    # -y answers apt's confirmation prompt, which cannot be answered in a
    # non-interactive CI job.
    sudo apt-get install -y python-dev
}
cython-blis-1.0.0/bin/travis/before_install_osx.sh 0000775 0000000 0000000 00000023317 14634250137 0022256 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Use with ``source osx_utils.sh``
# This file is meant to be sourced, so errexit is switched on in the shell
# that sources it, not only for the statements below.
set -e
# Get our own location on this filesystem, load common utils
MULTIBUILD_DIR=$(dirname "${BASH_SOURCE[0]}")
source $MULTIBUILD_DIR/common_utils.sh
export MACOSX_DEPLOYMENT_TARGET=10.7
# Base URL of the installer downloads and the install prefix used to build
# $PYTHON_EXE in install_mac_cpython.
MACPYTHON_URL=https://www.python.org/ftp/python
MACPYTHON_PY_PREFIX=/Library/Frameworks/Python.framework/Versions
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
# Relative directory that downloaded installers and get-pip.py are saved to.
DOWNLOADS_SDIR=downloads
WORKING_SDIR=working
# As of 28 June 2018 - latest Python of each version with binary download
# available.
# See: https://www.python.org/downloads/mac-osx/
# fill_pyver expands a major or major.minor request to one of these values.
LATEST_2p7=2.7.15
LATEST_2p6=2.6.6
LATEST_3p2=3.2.5
LATEST_3p3=3.3.5
LATEST_3p4=3.4.4
LATEST_3p5=3.5.4
LATEST_3p6=3.6.6
LATEST_3p7=3.7.0
function check_python {
    # Exit the shell with status 1 unless $PYTHON_EXE is set and non-empty.
    [ -n "$PYTHON_EXE" ] && return 0
    echo "PYTHON_EXE variable not defined"
    exit 1
}
function check_pip {
    # Exit the shell with status 1 unless $PIP_CMD is set and non-empty.
    [ -n "$PIP_CMD" ] && return 0
    echo "PIP_CMD variable not defined"
    exit 1
}
function check_var {
    # Exit the shell with status 1 when the first argument is missing or empty.
    local value=$1
    if [ -n "$value" ]; then
        return 0
    fi
    echo "required variable not defined"
    exit 1
}
function get_py_digit {
    # Print the major version number reported by the interpreter $PYTHON_EXE.
    check_python
    local snippet="import sys; print(sys.version_info[0])"
    $PYTHON_EXE -c "$snippet"
}
function get_py_mm {
    # Print "major.minor" as reported by the interpreter $PYTHON_EXE.
    check_python
    local snippet="import sys; print('{0}.{1}'.format(*sys.version_info[0:2]))"
    $PYTHON_EXE -c "$snippet"
}
function get_py_mm_nodot {
    # Print major and minor version of $PYTHON_EXE with no separator.
    check_python
    local snippet="import sys; print('{0}{1}'.format(*sys.version_info[0:2]))"
    $PYTHON_EXE -c "$snippet"
}
function get_py_prefix {
    # Print sys.prefix of the interpreter $PYTHON_EXE.
    check_python
    local snippet="import sys; print(sys.prefix)"
    $PYTHON_EXE -c "$snippet"
}
function fill_pyver {
    # Expand a Python version request to major.minor.micro and print it.
    #
    # A value that already contains major.minor.micro is printed unchanged.
    # A bare major ("2", "3") or a major.minor value is mapped to the
    # matching LATEST_* constant, e.g.
    #     2   -> $LATEST_2p7
    #     3.6 -> $LATEST_3p6
    # Any other value prints an error on stderr and exits with status 1.
    local ver=$1
    check_var $ver
    if [[ $ver =~ [0-9]+\.[0-9]+\.[0-9]+ ]]; then
        # Nothing to fill in
        echo $ver
        return
    fi
    case "$ver" in
        2|2.7) echo $LATEST_2p7 ;;
        2.6)   echo $LATEST_2p6 ;;
        3|3.7) echo $LATEST_3p7 ;;
        3.6)   echo $LATEST_3p6 ;;
        3.5)   echo $LATEST_3p5 ;;
        3.4)   echo $LATEST_3p4 ;;
        3.3)   echo $LATEST_3p3 ;;
        3.2)   echo $LATEST_3p2 ;;
        *)
            echo "Can't fill version $ver" 1>&2
            exit 1
            ;;
    esac
}
function pyinst_ext_for_version {
    # Print the installer file extension for a Python version: "pkg" or "dmg"
    # Parameters
    #   $py_version (python version, expanded through fill_pyver)
    #
    # Python 2 switches from .dmg to .pkg at 2.7.9, Python 3 and later at
    # 3.4.2. Nothing is printed for a major version below 2.
    local py_version=$1
    check_var $py_version
    py_version=$(fill_pyver $py_version)
    local major=${py_version:0:1}
    local first_pkg
    if [ $major -eq 2 ]; then
        first_pkg=2.7.9
    elif [ $major -ge 3 ]; then
        first_pkg=3.4.2
    else
        return 0
    fi
    if [ "$(lex_ver $py_version)" -ge "$(lex_ver $first_pkg)" ]; then
        echo "pkg"
    else
        echo "dmg"
    fi
}
function pyinst_fname_for_version {
    # Print the file name of the OSX installer for a Python version
    # Parameters
    #   $py_version (python version in major.minor.extra format)
    local py_version=$1
    local inst_ext=$(pyinst_ext_for_version $py_version)
    # Installers carry a 10.6 suffix, except versions up to 2.6.6 (10.3)
    local osx_ver=10.6
    if [ "$(lex_ver $py_version)" -le "$(lex_ver 2.6.6)" ]; then
        osx_ver=10.3
    fi
    echo "python-$py_version-macosx${osx_ver}.$inst_ext"
}
function install_mac_cpython {
    # Installs Python.org Python
    # Parameter $version
    # Version given in major or major.minor or major.minor.micro e.g
    # "3" or "3.4" or "3.4.1".
    # sets $PYTHON_EXE variable to python executable
    local py_version=$(fill_pyver $1)
    local py_stripped=$(strip_ver_suffix $py_version)
    local py_inst=$(pyinst_fname_for_version $py_version)
    local inst_path=$DOWNLOADS_SDIR/$py_inst
    mkdir -p $DOWNLOADS_SDIR
    curl $MACPYTHON_URL/$py_stripped/${py_inst} > $inst_path
    # .dmg images have to be mounted first; the package to install is the
    # .mpkg inside the mounted volume.
    if [ "${py_inst: -3}" == "dmg" ]; then
        hdiutil attach $inst_path -mountpoint /Volumes/Python
        inst_path=/Volumes/Python/Python.mpkg
    fi
    sudo installer -pkg $inst_path -target /
    # major.minor is everything before the last dot. Taking the first three
    # characters instead would turn e.g. 3.10.4 into "3.1".
    local py_mm=${py_version%.*}
    PYTHON_EXE=$MACPYTHON_PY_PREFIX/$py_mm/bin/python$py_mm
    # Install certificates for Python 3.6
    local inst_cmd="/Applications/Python ${py_mm}/Install Certificates.command"
    if [ -e "$inst_cmd" ]; then
        sh "$inst_cmd"
    fi
}
function install_pip {
    # Bootstrap pip for the interpreter given by $PYTHON_EXE via get-pip.py
    # Sets $PIP_CMD to a sudo'd pip<major.minor> located in the same
    # directory as $PYTHON_EXE
    check_python
    local get_pip=$DOWNLOADS_SDIR/get-pip.py
    mkdir -p $DOWNLOADS_SDIR
    curl $GET_PIP_URL > $get_pip
    local py_mm=$(get_py_mm)
    # Python 2.6 will fail SSL check
    [ "$py_mm" == "2.6" ] && local pip_args="--trusted-host=pypi.org"
    # Travis VMs ship pip for the system python already, so force the
    # installation even when pip is present.
    sudo $PYTHON_EXE $get_pip --ignore-installed $pip_args
    PIP_CMD="sudo $(dirname $PYTHON_EXE)/pip$py_mm"
    # Only append pip_args when there are any, so that PIP_CMD never ends in
    # a trailing space.
    if [ -n "$pip_args" ]; then
        PIP_CMD="$PIP_CMD $pip_args"
    fi
}
function install_virtualenv {
    # Install virtualenv with $PIP_CMD and record its expected location
    # (the directory of $PYTHON_EXE) in $VIRTUALENV_CMD
    check_pip
    # Forced with --ignore-installed: Travis VMs already provide a
    # virtualenv for the system python
    ${PIP_CMD} install virtualenv --ignore-installed
    check_python
    VIRTUALENV_CMD="$(dirname ${PYTHON_EXE})/virtualenv"
}
function make_workon_venv {
    # Create a virtualenv and point $PYTHON_EXE / $PIP_CMD at it
    # Parameter $venv_dir
    #   directory for the virtualenv; "venv" when omitted or empty
    local venv_dir=${1:-venv}
    venv_dir=$(abspath $venv_dir)
    check_python
    $PYTHON_EXE -m virtualenv $venv_dir
    PYTHON_EXE=$venv_dir/bin/python
    PIP_CMD=$venv_dir/bin/pip
}
function remove_travis_ve_pip {
    # Uninstall the virtualenv and pip found under /usr/local/bin.
    #
    # virtualenv is removed first (only when both tools resolve to
    # /usr/local/bin), then pip itself.
    # FIXME: What if virtualenv is installed but pip is not?
    if [ "$(sudo which virtualenv)" == /usr/local/bin/virtualenv ]; then
        if [ "$(sudo which pip)" == /usr/local/bin/pip ]; then
            sudo pip uninstall -y virtualenv
        fi
    fi
    if [ "$(sudo which pip)" == /usr/local/bin/pip ]; then
        sudo pip uninstall -y pip
    fi
}
function set_py_vars {
    # Prepend the directory of $PYTHON_EXE to $PATH and export
    # $PYTHON_EXE and $PIP_CMD.
    # Kept for back-compatibility with the terryfy project.
    local py_bin_dir=$(dirname $PYTHON_EXE)
    export PATH="$py_bin_dir:$PATH"
    export PYTHON_EXE PIP_CMD
}
function get_macpython_environment {
    # Install a MacPython interpreter plus pip, optionally inside a virtualenv.
    #
    # Parameters:
    #     $1 (version) : [implementation-]major[.minor[.patch]]
    #         Python to install, e.g. "3.6" or "pypy-5.4"
    #     $2 (venv_dir) : optional directory name
    #         When given, a virtualenv is created there and activated, and the
    #         python / pip commands refer to it.
    #
    # Effects:
    #     $PYTHON_EXE - path to the Python executable (exported)
    #     $PIP_CMD - full pip command, including sudo if needed (exported)
    #     $VIRTUALENV_CMD - set by install_virtualenv when $venv_dir is given
    #     $PATH - gains the directory of $PYTHON_EXE when no virtualenv is used
    local version=$1
    local venv_dir=$2
    [ "$USE_CCACHE" != "1" ] || activate_ccache
    remove_travis_ve_pip
    install_mac_cpython $version
    install_pip
    if [ -z "$venv_dir" ]; then
        local py_bin_dir=$(dirname $PYTHON_EXE)
        export PATH="$py_bin_dir:$PATH"
    else
        install_virtualenv
        make_workon_venv $venv_dir
        source $venv_dir/bin/activate
    fi
    export PYTHON_EXE PIP_CMD
}
function install_delocate {
    # Install delocate with $PIP_CMD, pinning wheel first on old Pythons.
    check_pip
    local py_lex=$(lex_ver $(get_py_mm))
    local min_lex=$(lex_ver 2.7)
    if [ $py_lex -lt $min_lex ]; then
        # Wheel 0.30 doesn't work for Python 2.6; see:
        # https://github.com/pypa/wheel/issues/193
        $PIP_CMD install "wheel<=0.29"
    fi
    $PIP_CMD install delocate
}
function repair_wheelhouse {
    # Run delocate over every wheel in the directory given as $1.
    local wheel_dir=$1
    install_delocate
    # Copy library dependencies into the wheels.
    delocate-wheel $wheel_dir/*.whl
    # Add platform tags marking the wheels as compatible with OSX 10.9 and
    # 10.10. Wheels built against Python.org Python are in fact compatible
    # with OSX >= 10.6, but pip < 6.0 does not realize this, so the extra tags
    # help users with an older pip. Not needed for OSX releases that came out
    # well after pip 6.0. See:
    # https://github.com/MacPython/wiki/wiki/Spinning-wheels#question-will-pip-give-me-a-broken-wheel
    delocate-addplat --rm-orig -x 10_9 -x 10_10 $wheel_dir/*.whl
}
function install_pkg_config {
    # Install pkg-config through homebrew unless it is already available,
    # which avoids the homebrew error described at:
    # https://github.com/matthew-brett/multibuild/issues/24#issue-221951587
    if ! command -v pkg-config > /dev/null 2>&1; then
        brew install pkg-config
    fi
}
function activate_ccache {
    # Install ccache via homebrew and put its compiler wrappers first on PATH.
    brew install ccache
    export CCACHE_CPP2=1
    export PATH=/usr/local/opt/ccache/libexec:$PATH
    # Prove to the developer that ccache is activated
    local active_clang=$(which clang)
    echo "Using C compiler: $active_clang"
}
function before_install {
    # Prepare the macOS build: select clang, install Python $MB_PYTHON_VERSION
    # into the "venv" virtualenv, activate it and upgrade pip and wheel.
    #
    # oclint is uninstalled first (see Travis-CI gh-8826); a failure of that
    # step is ignored.
    brew cask uninstall oclint || true
    export CC=clang CXX=clang++
    get_macpython_environment $MB_PYTHON_VERSION venv
    source venv/bin/activate
    pip install --upgrade pip wheel
}
cython-blis-1.0.0/bin/travis/common_utils.sh 0000775 0000000 0000000 00000031610 14634250137 0021100 0 ustar 00root root 0000000 0000000 #!/bin/bash
# Utilities for both OSX and Docker Linux
# Python should be on the PATH
# Only source common_utils once
if [ -n "$COMMON_UTILS_SOURCED" ]; then
return
fi
COMMON_UTILS_SOURCED=1
# Turn on exit-if-error
set -e
# Directory holding this script; fetch_unpack and install_wheel use it to
# locate the bundled archives and supported_wheels.py.
MULTIBUILD_DIR=$(dirname "${BASH_SOURCE[0]}")
# IS_OSX is set only when uname reports Darwin; install_rsync tests it.
if [ $(uname) == "Darwin" ]; then IS_OSX=1; fi
# Work round bug in travis xcode image described at
# https://github.com/direnv/direnv/issues/210
shell_session_update() { :; }
# Workaround for https://github.com/travis-ci/travis-ci/issues/8703
# suggested by Thomas K at
# https://github.com/travis-ci/travis-ci/issues/8703#issuecomment-347881274
unset -f cd
unset -f pushd
unset -f popd
function start_spinner {
    # Start a background keep-alive that writes to stderr every 60 seconds,
    # so travis does not quit for lack of output. Does nothing when a spinner
    # is already recorded in $MB_SPINNER_PID.
    [ -z "$MB_SPINNER_PID" ] || return 0
    echo "Building libraries..." >&2
    (
        while true; do
            sleep 60
            echo "Still building..." >&2
        done
    ) &
    MB_SPINNER_PID=$!
    disown
}
function stop_spinner {
    # Kill the keep-alive process recorded in $MB_SPINNER_PID, if any, and
    # clear the variable.
    if [ -z "$MB_SPINNER_PID" ]; then
        return
    fi
    kill $MB_SPINNER_PID
    unset MB_SPINNER_PID
    echo "Building libraries finished." >&2
}
function abspath {
    # Echo the absolute, normalized form of the path given as $1.
    #
    # The path is handed to Python through sys.argv instead of being pasted
    # into the program text, so quotes or backslashes in the path can neither
    # break the snippet nor be read as Python escapes.
    python -c "import os.path, sys; print(os.path.abspath(sys.argv[1]))" "$1"
}
function relpath {
    # Echo the path $1 relative to $2 (or to $PWD when $2 is not given).
    #
    # Both paths are handed to Python through sys.argv instead of being pasted
    # into the program text, so quotes or backslashes in them can neither
    # break the snippet nor be read as Python escapes.
    python -c "import os.path, sys; print(os.path.relpath(sys.argv[1], sys.argv[2]))" \
        "$1" "${2:-$PWD}"
}
function realpath {
    # Echo the canonical form of the path given as $1, symlinks resolved.
    #
    # The path is handed to Python through sys.argv instead of being pasted
    # into the program text, so quotes or backslashes in the path can neither
    # break the snippet nor be read as Python escapes.
    python -c "import os, sys; print(os.path.realpath(sys.argv[1]))" "$1"
}
function lex_ver {
    # Echo a dot-separated version as three zero-padded 3-digit fields, with
    # no trailing newline. Missing fields count as zero:
    #     3.2.1 -> 003002001
    #     3     -> 003000000
    local padded
    padded=$(echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}')
    printf '%s' "$padded"
}
function unlex_ver {
    # Inverse of lex_ver: turn a 9-digit padded version into major.minor.micro
    #     003002001 -> 3.2.1
    #     003000000 -> 3.0.0
    # Each 3-digit field is forced to base 10 so leading zeros are not octal.
    local major=$((10#${1:0:3}+0))
    local minor=$((10#${1:3:3}+0))
    local micro=$((10#${1:6:3}+0))
    echo "$major.$minor.$micro"
}
function strip_ver_suffix {
    # Normalize the version in $1 to plain major.minor.micro by passing it
    # through lex_ver and back through unlex_ver.
    local lexed=$(lex_ver $1)
    echo $(unlex_ver $lexed)
}
function is_function {
    # Echo "true" when $1 names a defined shell function; echo nothing (and
    # return non-zero) otherwise.
    # Runs in a subshell with errexit off so the lookup is safe under "set -e".
    (
        set +e
        declare -Ff "$1" > /dev/null && echo true
    )
}
function gh-clone {
    # Clone the GitHub repository named by $1 (in "owner/repo" form).
    local repo_slug=$1
    git clone https://github.com/$repo_slug
}
function set_opts {
    # Set shell options from an options string in $- format.
    #
    # Parameters:
    #     $1 : option letters that should be ON; every letter of "exhimBH"
    #          that is absent from it is turned OFF.
    #
    # The loop variables are declared local so that calling this function
    # does not clobber an `i` or `char` belonging to the caller.
    local opts=$1
    local chars="exhimBH"
    local i char
    for (( i=0; i<${#chars}; i++ )); do
        char=${chars:$i:1}
        # Deleting every character except $char leaves text only if $char
        # occurs in $opts.
        [ -n "${opts//[^${char}]/}" ] && set -$char || set +$char
    done
}
function suppress {
    # Run a command, showing its output only if its return code is not 0.
    # Takes the current state of the -e option into account.
    #
    # Parameters:
    #     $@ : command and arguments to run (subject to word splitting)
    #
    # Returns the exit status of the command, or that of mktemp when the
    # temporary output file cannot be created.
    #
    # Compare
    # https://unix.stackexchange.com/questions/256120/how-can-i-suppress-output-only-if-the-command-succeeds#256122
    # Set -e stuff agonized over in
    # https://unix.stackexchange.com/questions/296526/set-e-in-a-subshell
    #
    # Declaration and assignment are kept apart: `local tmp=$(...)` reports
    # the status of `local`, which would hide a mktemp failure.
    local tmp
    tmp=$(mktemp tmp.XXXXXXXXX) || return
    local opts=$-
    echo "Running $@"
    set +e
    # The subshell restores the caller's options so the command runs as it
    # would have unsuppressed; its status is captured with errexit off.
    ( set_opts $opts ; $@ > "$tmp" 2>&1 ) ; ret=$?
    [ "$ret" -eq 0 ] || cat "$tmp"
    rm -f "$tmp"
    set_opts $opts
    return "$ret"
}
function rm_mkdir {
    # Remove the directory $1 if present, then create it empty.
    # Exits the shell with status 1 when $1 is empty.
    #
    # $path is quoted throughout so a path containing whitespace is handled
    # as one directory instead of being split into several arguments.
    local path=$1
    if [ -z "$path" ]; then echo "Need not-empty path"; exit 1; fi
    if [ -d "$path" ]; then rm -rf "$path"; fi
    mkdir "$path"
}
function untar {
    # Unpack the archive $1 into the current directory, choosing the tool
    # from the file extension (tar, gz/tgz, bz2, zip, xz).
    # Exits the shell with status 1 for an empty name or unknown extension.
    local in_fname=$1
    if [ -z "$in_fname" ];then echo "in_fname not defined"; exit 1; fi
    local extension=${in_fname##*.}
    case $extension in
        tar) tar -xf "$in_fname" ;;
        gz|tgz) tar -zxf "$in_fname" ;;
        bz2) tar -jxf "$in_fname" ;;
        zip) unzip -qq "$in_fname" ;;
        # `-f -` makes tar read the decompressed stream from stdin; a bare
        # `-xf` has no archive operand and tar rejects it.
        xz) unxz -c "$in_fname" | tar -xf - ;;
        *) echo Did not recognize extension $extension; exit 1 ;;
    esac
}
function install_rsync {
    # On non-OSX hosts, install rsync with yum when it is not on the PATH.
    # Does nothing when $IS_OSX is set.
    if [ -n "$IS_OSX" ]; then
        return
    fi
    type -P rsync > /dev/null || yum install -y rsync
}
function fetch_unpack {
    # Obtain an archive and unpack it into the current directory.
    #
    # Parameters:
    #     $1 (url) : URL from which to fetch the archive
    #     $2 (archive_fname) : optional archive name; defaults to the
    #         basename of the URL
    #
    # Echoes the unpacked directory and file names.
    #
    # The archive is kept in $ARCHIVE_SDIR (default "archives"). A copy
    # already there is reused; otherwise a copy bundled under
    # $MULTIBUILD_DIR/archives is symlinked in, and only failing that is the
    # URL downloaded.
    local url=$1
    [ -n "$url" ] || { echo "url not defined"; exit 1; }
    local archive_fname=${2:-$(basename $url)}
    local arch_sdir="${ARCHIVE_SDIR:-archives}"
    local out_archive="${arch_sdir}/${archive_fname}"
    # Make the archive directory in case it doesn't exist
    mkdir -p $arch_sdir
    if [ ! -f "$out_archive" ]; then
        local our_archive="${MULTIBUILD_DIR}/archives/${archive_fname}"
        if [ ! -f "$our_archive" ]; then
            curl -L $url > $out_archive
        else
            ln -s $our_archive $out_archive
        fi
    fi
    # Unpack into a scratch directory, list the top-level names, then sync
    # them into the parent so existing contents are refreshed.
    rm_mkdir arch_tmp
    install_rsync
    ( cd arch_tmp && untar ../$out_archive && ls -1d * && rsync --delete -ah * .. )
}
function clean_code {
    # Put a repository into a pristine state at a given commit.
    #
    # Parameters:
    #     $1 : repository directory (default $REPO_DIR)
    #     $2 : commit to check out (default $BUILD_COMMIT)
    #
    # Exits the shell with status 1 when either value is empty.
    local repo_dir=${1:-$REPO_DIR}
    local build_commit=${2:-$BUILD_COMMIT}
    if [ -z "$repo_dir" ]; then echo "repo_dir not defined"; exit 1; fi
    if [ -z "$build_commit" ]; then echo "build_commit not defined"; exit 1; fi
    # $repo_dir may be a submodule, and submodules have no .git directory.
    # If the directory is copied around, tools such as Versioneer that need a
    # real git repository cannot determine the version, so give the submodule
    # a proper git directory first.
    fill_submodule "$repo_dir"
    (
        cd $repo_dir &&
            git fetch origin &&
            git checkout $build_commit &&
            git clean -fxd &&
            git reset --hard &&
            git submodule update --init --recursive
    )
}
function build_wheel_cmd {
    # Build a wheel with the named command and repair the wheelhouse.
    #
    # Parameters:
    #     $1 (cmd) : name of the wheel-building command
    #         (optional, default "pip_wheel_cmd")
    #     $2 (repo_dir) : directory to build in (optional, default $REPO_DIR)
    #
    # Depends on
    #     REPO_DIR (or via input argument)
    #     WHEEL_SDIR (optional, default "wheelhouse")
    #     BUILD_DEPENDS (optional, default "")
    #     MANYLINUX_URL (optional, default "") (via pip_opts function)
    #
    # A `pre_build` function, when defined, runs first with the keep-alive
    # spinner active.
    local cmd=${1:-pip_wheel_cmd}
    local repo_dir=${2:-$REPO_DIR}
    if [ -z "$repo_dir" ]; then echo "repo_dir not defined"; exit 1; fi
    local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse})
    start_spinner
    [ -z "$(is_function "pre_build")" ] || pre_build
    stop_spinner
    [ -z "$BUILD_DEPENDS" ] || pip install $(pip_opts) $BUILD_DEPENDS
    ( cd $repo_dir && $cmd $wheelhouse )
    repair_wheelhouse $wheelhouse
}
function pip_wheel_cmd {
    # Build a wheel of the current directory with `pip wheel`, without
    # dependencies, writing it to the absolute wheelhouse path in $1.
    pip wheel $(pip_opts) -w $1 --no-deps .
}
function bdist_wheel_cmd {
    # Build a wheel with `setup.py bdist_wheel` and copy it to the absolute
    # wheelhouse path in $1.
    #
    # bdist_wheel is sometimes the better build route; for example it fixes
    # versioneer problems with versions:
    # https://github.com/warner/python-versioneer/issues/121
    python setup.py bdist_wheel
    cp dist/*.whl $1
}
function build_pip_wheel {
    # Build with build_wheel_cmd using pip_wheel_cmd; any arguments are
    # passed through after the command name.
    build_wheel_cmd pip_wheel_cmd $@
}
function build_bdist_wheel {
    # Build with build_wheel_cmd using bdist_wheel_cmd; any arguments are
    # passed through after the command name.
    build_wheel_cmd bdist_wheel_cmd $@
}
function build_wheel {
    # Default wheel build: delegate to build_pip_wheel with all arguments.
    build_pip_wheel $@
}
function build_index_wheel {
    # Build a wheel for a project fetched from an index, usually pypi.
    #
    # Parameters:
    #     $1 (project_spec) : requirement to build, e.g. "tornado" or
    #         "tornado==4.4.1"
    #     remaining arguments : passed to both the pip `install` and the pip
    #         `wheel` command
    #
    # Depends on
    #     WHEEL_SDIR (optional, default "wheelhouse")
    #     BUILD_DEPENDS (optional, default "")
    #     MANYLINUX_URL (optional, default "") (via pip_opts function)
    #
    # Override `pip_opts` to use indices other than pypi.
    local project_spec=$1
    if [ -z "$project_spec" ]; then echo "project_spec not defined"; exit 1; fi
    # Drop the first argument; what remains goes to pip.
    shift
    local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse})
    start_spinner
    [ -z "$(is_function "pre_build")" ] || pre_build
    stop_spinner
    [ -z "$BUILD_DEPENDS" ] || pip install $(pip_opts) $@ $BUILD_DEPENDS
    pip wheel $(pip_opts) $@ -w $wheelhouse --no-deps $project_spec
    repair_wheelhouse $wheelhouse
}
function pip_opts {
    # Echo extra pip options: "--find-links $MANYLINUX_URL" when that
    # variable is set. Echoes nothing and returns non-zero otherwise.
    [ -n "$MANYLINUX_URL" ] || return 1
    echo "--find-links $MANYLINUX_URL"
}
function get_platform {
    # Echo the machine field of Python's platform.uname().
    python -c 'import platform; print(platform.machine())'
}
function get_distutils_platform {
    # Echo the platform string from distutils.util.get_platform().
    # This is the platform tag that pip will use.
    python -c "from distutils.util import get_platform; print(get_platform())"
}
function install_wheel {
    # Install the test dependencies, then the built wheel.
    #
    # All arguments are passed as flags to both pip install steps.
    #
    # Depends on:
    #     WHEEL_SDIR (optional, default "wheelhouse")
    #     TEST_DEPENDS (optional, default "")
    #     MANYLINUX_URL (optional, default "") (via pip_opts function)
    local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse})
    [ -z "$TEST_DEPENDS" ] || pip install $(pip_opts) $@ $TEST_DEPENDS
    # supported_wheels.py picks, from the wheelhouse, the wheels to install.
    local compatible=$(python $MULTIBUILD_DIR/supported_wheels.py $wheelhouse/*.whl)
    pip install $(pip_opts) $@ $compatible
}
function install_run {
# Install the built wheel, then run the tests from a fresh subdirectory.
# Depends on function `run_tests` defined in `config.sh`
install_wheel
# The tests run inside a subshell so the caller's working directory is kept.
mkdir tmp_for_test
(cd tmp_for_test && run_tests)
}
function fill_submodule {
    # Give a submodule checkout its own .git directory, if it lacks one.
    # See:
    # https://stackoverflow.com/questions/41776331/is-there-a-way-to-reconstruct-a-git-directory-for-a-submodule
    #
    # Parameters:
    #     $1 : repository directory; the shell exits with status 1 if empty
    local repo_dir="$1"
    if [ -z "$repo_dir" ]; then echo "repo_dir not defined"; exit 1; fi
    local git_loc="$repo_dir/.git"
    # For an ordinary submodule .git is a file; a directory needs no work.
    if [ -d "$git_loc" ]; then
        return
    fi
    # Rebuild the repository: clone it next to itself, swap the clone in,
    # and point origin back at the original remote URL.
    local origin_url=$(cd "$repo_dir" && git config --get remote.origin.url)
    local repo_copy="$repo_dir-$RANDOM"
    git clone --recursive "$repo_dir" "$repo_copy"
    rm -rf "$repo_dir"
    mv "${repo_copy}" "$repo_dir"
    ( cd "$repo_dir" && git remote set-url origin $origin_url )
}
# Base URL of the PyPy downloads page.
PYPY_URL=https://bitbucket.org/pypy/pypy/downloads
# As of 2018-04-25, the latest versions of PyPy.
# Each LATEST_PP_<ver> maps a major or major.minor version (with "." written
# as "p") to its newest release; unroll_version follows these recursively.
LATEST_PP_1=1.9
LATEST_PP_2p0=2.0.2
# No minor version numbers for 2.1
LATEST_PP_2p1=2.1
LATEST_PP_2p2=2.2.1
LATEST_PP_2p3=2.3.1
LATEST_PP_2p4=2.4.0
LATEST_PP_2p5=2.5.1
LATEST_PP_2p6=2.6.1
LATEST_PP_2=$LATEST_PP_2p6
LATEST_PP_4p0=4.0.1
LATEST_PP_4=$LATEST_PP_4p0
LATEST_PP_5p0=5.0.1
LATEST_PP_5p1=5.1.1
LATEST_PP_5p3=5.3.1
LATEST_PP_5p4=5.4.1
LATEST_PP_5p6=5.6.0
LATEST_PP_5p7=5.7.1
LATEST_PP_5p8=5.8.0
LATEST_PP_5p9=5.9.0
LATEST_PP_5p10=5.10.1
LATEST_PP_5=$LATEST_PP_5p10
LATEST_PP_6p0=6.0.0
LATEST_PP_6=$LATEST_PP_6p0
function unroll_version {
    # Expand a major or major.minor version to its latest full version by
    # following the <prefix>_<version> variables recursively.
    #
    # Parameters:
    #     $1 (prefix) : variable-name prefix, e.g. LATEST_PP
    #     $2 (version) : major[.minor[.patch]]
    #
    # With the LATEST_PP table defined in this file:
    #     LATEST_PP 5   -> 5.10.1
    #     LATEST_PP 2.2 -> 2.2.1
    # A version with no matching variable is echoed unchanged.
    local prefix=$1
    local ver=$2
    # Variable names cannot contain ".", so it is spelled "p" in the lookup.
    local latest=${prefix}_${ver//./p}
    if [ -z "${!latest}" ]; then
        echo $ver
        return
    fi
    echo $(unroll_version ${prefix} ${!latest})
}
function fill_pypy_ver {
    # Expand a PyPy major or major.minor version to major.minor.micro using
    # the LATEST_PP table.
    #
    # Parameters:
    #     $1 (version) : major[.minor[.patch]]
    local full_ver=$(unroll_version LATEST_PP $1)
    echo $full_ver
}
function get_pypy_build_prefix {
    # Echo the file-name prefix used for a PyPy release.
    #
    # Parameters:
    #     $1 (version) : pypy2 version number
    #
    # Versions from 5.3 on use "pypy2-v", older ones "pypy-". Input that has
    # no major.minor number prints an error and exits the shell with status 1.
    local version=$1
    if [[ ! $version =~ ([0-9]+)\.([0-9]+) ]]; then
        echo "error: expected version number, got $1" 1>&2
        exit 1
    fi
    local major=${BASH_REMATCH[1]}
    local minor=${BASH_REMATCH[2]}
    if (( major > 5 || (major == 5 && minor >= 3) )); then
        echo "pypy2-v"
    else
        echo "pypy-"
    fi
}
retry () {
    # Run a command (with arguments) until it succeeds, at most 5 times,
    # sleeping one second after every failed attempt.
    # https://gist.github.com/fungusakafungus/1026804
    #
    # Returns 0 on the first success; after 5 failures prints a message to
    # stderr and returns 1.
    local retry_max=5
    local remaining
    for (( remaining = retry_max; remaining > 0; remaining-- )); do
        if "$@"; then
            return 0
        fi
        sleep 1
    done
    echo "Retry failed [$retry_max]: $@" >&2
    return 1
}
cython-blis-1.0.0/bin/travis_setup.sh 0000775 0000000 0000000 00000000670 14634250137 0017612 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
# Travis set-up: on Linux workers, install gcc-6, binutils and clang from the
# ubuntu-toolchain-r PPA and make gcc-6 the compiler for the build.
set -e
if [ "$TRAVIS_OS_NAME" = "linux" ]; then
    # Stop the background apt jobs before apt is used below.
    sudo systemctl disable apt-daily.timer
    # killall exits non-zero when no matching process is running; that is
    # fine here and must not abort the script under `set -e`.
    sudo killall apt.systemd.daily || true
    sleep 5
    sudo -E apt-add-repository -y "ppa:ubuntu-toolchain-r/test"
    sleep 5
    sudo apt-get update -y
    sleep 5
    sudo apt-get install -y gcc-6 binutils clang
    # Rewrite the recorded compiler name in the Linux build recipe to gcc-6.
    sed -i 's/"gcc"/"gcc-6"/' blis/_src/make/linux-x86_64.jsonl
    export CC="gcc-6"
fi
#if [ "$TRAVIS_OS_NAME" = "osx" ]; then
#fi
cython-blis-1.0.0/bin/update-vendored-source 0000775 0000000 0000000 00000001201 14634250137 0021024 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
# Rebuild blis/_src from the flame-blis checkout: copy the source trees into
# a fresh blis/_new_src, create empty include/ and make/ directories, then
# swap the new tree in, moving the previous blis/_src to _old_src.
set -e
rm -rf blis/_new_src
mkdir blis/_new_src
for src_tree in config frame kernels ref_kernels; do
    cp -r flame-blis/$src_tree blis/_new_src/$src_tree
done
mkdir blis/_new_src/include
for platform in \
    darwin-x86_64_no_skx linux-x86_64_no_skx windows-x86_64_no_skx \
    darwin-generic linux-generic windows-generic; do
    mkdir blis/_new_src/include/$platform
done
mkdir blis/_new_src/make
mv blis/_src _old_src
mv blis/_new_src blis/_src
cython-blis-1.0.0/blis/ 0000775 0000000 0000000 00000000000 14634250137 0014701 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/__init__.pxd 0000664 0000000 0000000 00000000000 14634250137 0017143 0 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/__init__.py 0000664 0000000 0000000 00000000151 14634250137 0017007 0 ustar 00root root 0000000 0000000 # Copyright (c) 2017 - 2022 ExplosionAI GmbH, released under BSD-3-Clause.
from .cy import init
# Call the initializer from the `.cy` submodule as soon as the package is imported.
init()
cython-blis-1.0.0/blis/_src/ 0000775 0000000 0000000 00000000000 14634250137 0015627 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/ 0000775 0000000 0000000 00000000000 14634250137 0017074 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/README.md 0000664 0000000 0000000 00000001467 14634250137 0020363 0 ustar 00root root 0000000 0000000
For more information on sub-configurations and configuration families in BLIS,
please read the Configuration Guide, which can be viewed in markdown-rendered
form [from the BLIS wiki page](https://github.com/flame/blis/wiki/).
If you don't have time, or are impatient, take a look at the `config_registry`
file in the top-level directory of the BLIS distribution. It contains a
grammar-like mapping of configuration names, or families, to sub-configurations,
which may be other families. Keep in mind that the `/` notation:
```
<family>: <config>/<kernels>
```
means that the kernel set associated with `<kernels>` should be made available to
the configuration `<config>` if `<family>` is targeted at configure-time.
(Some configurations borrow kernels from other configurations, and this is how
we specify that requirement.)
cython-blis-1.0.0/blis/_src/config/a64fx/ 0000775 0000000 0000000 00000000000 14634250137 0020024 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/a64fx/bli_a64fx_sector_cache.h 0000664 0000000 0000000 00000010457 14634250137 0024464 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Forschunszentrum Juelich
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// A64FX: set up cache sizes
//
// Reference: A64FX (TM) specification Fujitsu HPC Extension
// Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf
//
// 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 |
// RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max |
//
// the bits set number of maximum sectors from 0-7
// 000 - 0
// 001 - 1
// 010 - 2
// 011 - 3
// 100 - 4
// 101 - 5
// 110 - 6
// 111 - 7
//
// For L1 we want to maximize the number of sectors for B
// Configuration 1: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 6 sectors for B (sector 2)
// 0 sectors for the rest (sector 0)
//
// 16b bitfield conf. 1: 0b0 001 0 110 0 001 0 000
//
// Configuration 2: 1 sector for C (sector 3)
// 1 sector for A (sector 1)
// 5 sectors for B (sector 2)
// 1 sectors for the rest (sector 0)
//
// 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001
//
// accessing the control register:
//
// MRS <Xt>, S3_3_C11_C8_2
// MSR S3_3_C11_C8_2, <Xt>
//
// TODO: First tests showed no change in performance, a deeper investigation
// is necessary
// Write config_bitfield to system register S3_3_C11_C8_2 (the L1 sector
// cache control register described in the comment above).
// Expands to a braced block, so call sites need no trailing semicolon.
#define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c11_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Same as A64FX_SETUP_SECTOR_CACHE_SIZES, but writes system register
// S3_3_C15_C8_2 (presumably the L2 counterpart, going by the macro name;
// confirm against the A64FX specification).
#define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\
{\
uint64_t sector_cache_config = config_bitfield;\
__asm__ volatile(\
"msr s3_3_c15_c8_2,%[sector_cache_config]"\
:\
: [sector_cache_config] "r" (sector_cache_config)\
:\
);\
}
// Assembly-text fragment for use inside a larger asm string: loads `tag`
// into `sparereg`, shifts it left by 56 bits and ORs it into `areg`, i.e.
// places the tag in the top byte of the address register. `sparereg` is
// overwritten.
#define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\
" mov "#sparereg", "#tag" \n\t"\
" lsl "#sparereg", "#sparereg", 56 \n\t"\
" orr "#areg", "#areg", "#sparereg" \n\t"
// Read system register S3_3_C11_C8_2 into the variable output_uint64.
// The argument must be a plain identifier: it is used both as the asm
// operand name and as the output lvalue.
#define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\
__asm__ volatile(\
"mrs %["#output_uint64"],s3_3_c11_c8_2"\
: [output_uint64] "=r" (output_uint64)\
: \
:\
);
// Pack four 3-bit sector maxima into the L1 sector cache configuration
// bitfield: sec0 in bits 2:0, sec1 in bits 6:4, sec2 in bits 10:8 and sec3
// in bits 14:12. Each argument is parenthesized so that an expression such
// as `a|b` is masked as a whole.
#define A64FX_SCC(sec0,sec1,sec2,sec3)\
(uint64_t)(((sec0) & 0x7LU) | (((sec1) & 0x7LU) << 4) | (((sec2) & 0x7LU) << 8) | (((sec3) & 0x7LU) << 12))
// Pack two 5-bit values into the L2 configuration bitfield: sec02 in bits
// 4:0 and sec13 in bits 12:8. Arguments are parenthesized for the same
// reason as in A64FX_SCC.
#define A64FX_SCC_L2(sec02,sec13)\
(uint64_t)(((sec02) & 0x1FLU) | (((sec13) & 0x1FLU) << 8))
cython-blis-1.0.0/blis/_src/config/a64fx/bli_cntx_init_a64fx.c 0000664 0000000 0000000 00000013254 14634250137 0024032 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "bli_a64fx_sector_cache.h"
// Initialize the context `cntx` for the a64fx configuration: start from the
// reference defaults, register the ARM SVE gemm micro-kernels and packm
// kernels, set the native level-3 blocksizes, and finally program the A64FX
// sector cache registers (unless CACHE_SECTOR_SIZE_READONLY is defined).
void bli_cntx_init_a64fx( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
// NOTE(review): thresh is only referenced inside the disabled "#if 0"
// section below.
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_a64fx_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
// The leading integer appears to be the number of kernel entries that
// follow (4 here, 2 for packm below) -- confirm against the BLIS API.
bli_cntx_set_l3_nat_ukrs
(
4,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
cntx
);
// Set SVE-512 packing routine.
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
// 12xk is not used and disabled for GCC 8-9 compatibility.
// BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk,
BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs
(
BLIS_NAT, 5,
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
cntx
);
// The small/unpacked (sup) gemm set-up below is compiled out.
#if 0
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
4,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
#endif
// Set A64FX cache sector sizes for each PE/CMG
// SC Fugaku might disable users' setting cache sizes.
#if !defined(CACHE_SECTOR_SIZE_READONLY)
// Both register writes sit inside the OpenMP parallel region.
#pragma omp parallel
{
// L1 sector maxima: sec0=0, sec1=1, sec2=3, sec3=0.
A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0))
// L2 values: sec02=9, sec13=28.
A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28))
}
#endif
}
cython-blis-1.0.0/blis/_src/config/a64fx/bli_family_a64fx.h 0000664 0000000 0000000 00000004203 14634250137 0023313 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Alignment value and SIMD register count used for this family.
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// SVE-specific configs.
// NOTE(review): presumably N_ = number of sets, W_ = associativity (ways) and
// C_ = cache line size in bytes for each cache level; e.g. 64 * 4 * 256 bytes
// = 64 KiB for L1. Confirm against the code that consumes these macros.
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif
cython-blis-1.0.0/blis/_src/config/a64fx/make_defs.mk 0000664 0000000 0000000 00000005552 14634250137 0022302 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := a64fx
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# Preprocessor defines: _GNU_SOURCE plus the _A64FX marker macro.
CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# "noopt" builds disable optimization; all others build at -O3 for
# ARMv8-A with the SVE extension.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve
endif
# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS)
CKVECFLAGS :=
# Flags specific to reference kernels.
# gcc and clang receive the same relaxed floating-point flags; any other
# compiler vendor only inherits CKVECFLAGS.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/amd64/ 0000775 0000000 0000000 00000000000 14634250137 0020007 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/amd64/bli_family_amd64.h 0000664 0000000 0000000 00000003332 14634250137 0023263 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard for the amd64 family header. Nothing is defined here
// apart from the guard macro itself.
#if !defined( BLIS_FAMILY_AMD64_H )
#define BLIS_FAMILY_AMD64_H
#endif // BLIS_FAMILY_AMD64_H
cython-blis-1.0.0/blis/_src/config/amd64/make_defs.mk 0000664 0000000 0000000 00000005027 14634250137 0022262 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The store-make-defs call at the bottom of the
# file saves the variables assigned here under this name.
THIS_CONFIG    := amd64
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables afterwards; only configuration-specific flags are
# set in this fragment.

# Warning, PIC, miscellaneous and preprocessor flags (all empty here).
CWARNFLAGS     :=
CPICFLAGS      :=
CMISCFLAGS     :=
CPPROCFLAGS    :=

# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
  CDBGFLAGS    := -g
endif

# Optimization level: -O2 by default, replaced by -O0 when DEBUG_TYPE is
# "noopt".
COPTFLAGS      := -O2
ifeq ($(DEBUG_TYPE),noopt)
  COPTFLAGS    := -O0
endif

# Settings for reference and optimized kernels are not made here; they
# are taken from the individual subconfiguration makefile fragments in
# this family.

# Copy every variable set above into a counterpart whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/amd64_legacy/ 0000775 0000000 0000000 00000000000 14634250137 0021333 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/amd64_legacy/bli_family_amd64_legacy.h 0000664 0000000 0000000 00000003513 14634250137 0026134 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard for the amd64_legacy family header. This is a bundle
// configuration, so the header is only a placeholder: nothing is defined
// here apart from the guard macro itself.
#if !defined( BLIS_FAMILY_AMD64_LEGACY_H )
#define BLIS_FAMILY_AMD64_LEGACY_H
#endif // BLIS_FAMILY_AMD64_LEGACY_H
cython-blis-1.0.0/blis/_src/config/amd64_legacy/make_defs.mk 0000664 0000000 0000000 00000005133 14634250137 0023604 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The store-make-defs call at the bottom of the
# file saves the variables assigned here under this name.
THIS_CONFIG    := amd64_legacy
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables afterwards; only configuration-specific flags are
# set in this fragment.

# Warning, PIC, miscellaneous and preprocessor flags (all empty here).
CWARNFLAGS     :=
CPICFLAGS      :=
CMISCFLAGS     :=
CPPROCFLAGS    :=

# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
  CDBGFLAGS    := -g
endif

# Optimization level: -O2 by default, replaced by -O0 when DEBUG_TYPE is
# "noopt".
COPTFLAGS      := -O2
ifeq ($(DEBUG_TYPE),noopt)
  COPTFLAGS    := -O0
endif

# Settings for reference and optimized kernels are not made here; they
# are taken from the individual subconfiguration makefile fragments in
# this family.

# Copy every variable set above into a counterpart whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/arm32/ 0000775 0000000 0000000 00000000000 14634250137 0020020 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/arm32/bli_family_arm32.h 0000664 0000000 0000000 00000003472 14634250137 0023312 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the arm32 configuration: it defines a single
// memory-allocation parameter.
// NOTE(review): the include guard below and its matching #endif at the end
// are commented out, so this header is unguarded -- confirm this is
// intentional.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// NOTE(review): presumably the SIMD alignment in bytes -- confirm against
// the code that consumes this macro.
#define BLIS_SIMD_ALIGN_SIZE 16
//#endif
cython-blis-1.0.0/blis/_src/config/arm32/make_defs.mk 0000664 0000000 0000000 00000005666 14634250137 0022304 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration. The store-make-defs call
# at the bottom of the file saves the variables assigned here under this
# name.
THIS_CONFIG    := arm32
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS    :=
CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization level: -O0 for "noopt" builds, -O2 otherwise.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Flags specific to optimized kernels.
# NOTE: -O3 is appended after COPTFLAGS, so the kernel flags end in -O3
# even when DEBUG_TYPE is "noopt".
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -march=armv7-a
else
$(error gcc is required for this configuration.)
endif

# Flags specific to reference kernels.
# Make stops at the error directive above unless CC_VENDOR is gcc, so
# only the gcc flags can ever be selected here; the former clang and
# fallback branches were unreachable and have been removed. Reinstate a
# vendor check here if other compilers are ever permitted above.
CROPTFLAGS     := $(CKOPTFLAGS)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/arm64/ 0000775 0000000 0000000 00000000000 14634250137 0020025 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/arm64/bli_family_arm64.h 0000664 0000000 0000000 00000004174 14634250137 0023324 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the arm64 configuration: memory-allocation parameters
// followed by default values for the SVE-specific settings.
// NOTE(review): the include guard below and its matching #endif at the end
// are commented out, so this header is unguarded -- confirm this is
// intentional.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// NOTE(review): presumably the SIMD alignment in bytes and the number of
// SIMD registers -- confirm against the code that consumes these macros.
#define BLIS_SIMD_ALIGN_SIZE 16
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// SVE-specific configs.
// NOTE(review): the N_/W_/C_ prefixes look like number of sets, number of
// ways and line size for cache levels L1-L3 (e.g. L1: 64 * 4 * 256 bytes
// = 64 KiB) -- confirm against the SVE blocksize code that reads them.
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif
cython-blis-1.0.0/blis/_src/config/arm64/make_defs.mk 0000664 0000000 0000000 00000005767 14634250137 0022313 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration. The store-make-defs call
# at the bottom of the file saves the variables assigned here under this
# name.
THIS_CONFIG    := arm64
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization level: -O0 for "noopt" builds, -O2 otherwise.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Flags specific to optimized kernels.
# NOTE: -O3 is appended after COPTFLAGS, so the kernel flags end in -O3
# even when DEBUG_TYPE is "noopt".
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -march=armv8-a
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -march=armv8-a
else
$(error gcc or clang is required for this configuration.)
endif

# Flags specific to reference kernels.
# Make stops at the error directive above unless CC_VENDOR is gcc or
# clang, and both vendors use the same flags, so no vendor check is
# needed here; the former fallback branch was unreachable and has been
# removed. Reinstate a vendor check here if other compilers are ever
# permitted above.
CROPTFLAGS     := $(CKOPTFLAGS)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/armsve/ 0000775 0000000 0000000 00000000000 14634250137 0020371 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/armsve/bli_cntx_init_armsve.c 0000664 0000000 0000000 00000014036 14634250137 0024743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// getauxval() and AT_HWCAP, used by bli_cntx_init_armsve() below, are
// declared in <sys/auxv.h>. The header name had been lost from this
// directive, leaving a bare "#include" that does not compile.
#include <sys/auxv.h>

// Fallback for system headers that do not define HWCAP_SVE: bit 22 of the
// AT_HWCAP word.
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
/*
   Initialize the context `cntx` for the armsve configuration.

   If the AT_HWCAP auxiliary-vector entry does not have the HWCAP_SVE bit
   set, the function returns immediately and `cntx` is left untouched.
   Otherwise the context is first filled with the reference defaults and
   then updated with the SVE gemm micro-kernels, the packm kernels that
   match the double-precision register blocksize, and the blocksizes
   reported by the armsve blocksize helpers.
*/
void bli_cntx_init_armsve( cntx_t* cntx )
{
    // Nothing below may run unless SVE is advertised in the hardware
    // capability word.
    const unsigned long hwcap = getauxval( AT_HWCAP );
    if ( ( hwcap & HWCAP_SVE ) == 0 )
        return;

    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
#if 0
    blksz_t thresh[ BLIS_NUM_THRESH ];
#endif

    // Start from the default (reference) kernels and blocksizes.
    bli_cntx_init_armsve_ref( cntx );

    // -------------------------------------------------------------------------

    // Blocksizes per datatype (s, d, c, z): register blocksizes mr/nr and
    // cache blocksizes kc/mc/nc, as reported by the armsve helpers.
    dim_t mr_s, nr_s, kc_s, mc_s, nc_s;
    dim_t mr_d, nr_d, kc_d, mc_d, nc_d;
    dim_t mr_c, nr_c, kc_c, mc_c, nc_c;
    dim_t mr_z, nr_z, kc_z, mc_z, nc_z;

    bli_s_blksz_armsve( &mr_s, &nr_s, &kc_s, &mc_s, &nc_s );
    bli_d_blksz_armsve( &mr_d, &nr_d, &kc_d, &mc_d, &nc_d );
    bli_c_blksz_armsve( &mr_c, &nr_c, &kc_c, &mc_c, &nc_c );
    bli_z_blksz_armsve( &mr_z, &nr_z, &kc_z, &mc_z, &nc_z );

    // Register the optimized native gemm micro-kernels together with their
    // storage preferences. The kernels are vector-length agnostic, but mr
    // still has to be known at runtime.
    bli_cntx_set_l3_nat_ukrs
    (
      4,
      BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armsve_asm_2vx10_unindexed, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armsve_asm_2vx10_unindexed, FALSE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE,
      cntx
    );

    // Vector-length-specific packing routines, chosen by the
    // double-precision mr. Any other mr keeps the packm kernels already in
    // the context.
    switch ( mr_d )
    {
        case 16:
            bli_cntx_set_packm_kers
            (
              2,
              BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk,
              BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk,
              cntx
            );
            break;

        case 8:
            bli_cntx_set_packm_kers
            (
              1,
              BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk,
              cntx
            );
            break;

        default:
            break;
    }

    // Fill the level-3 blocksize objects with the queried values.
    //                                           s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],  mr_s, mr_d, mr_c, mr_z );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],  nr_s, nr_d, nr_c, nr_z );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],  mc_s, mc_d, mc_c, mc_z );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],  kc_s, kc_d, kc_c, kc_z );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  nc_s, nc_d, nc_c, nc_z );

    // Install the register and cache blocksizes (and their multiples) for
    // native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );

#if 0
    // Disabled: small/unpacked (sup) gemm support for double precision.

    // Sup thresholds.
    //                                          s    d    c    z
    bli_blksz_init_easy( &thresh[ BLIS_MT ],   -1, 101,  -1,  -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ],   -1, 101,  -1,  -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ],   -1, 101,  -1,  -1 );

    // Install the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );

    // Register the optimized small/unpacked gemm kernels.
    bli_cntx_set_l3_sup_kers
    (
      4,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE,
      cntx
    );

    // Sup blocksize objects. Note that MR is given nr_d and NR is given
    // mr_d here, i.e. the two are swapped relative to the native case.
    //                                          s     d     c    z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],   -1, nr_d,   -1,  -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],   -1, mr_d,   -1,  -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   -1,  120,   -1,  -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   -1,  256,   -1,  -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],   -1, 2048,   -1,  -1 );

    // Install the register and cache blocksizes for small/unpacked level-3
    // problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &blkszs[ BLIS_NC ],
      BLIS_KC, &blkszs[ BLIS_KC ],
      BLIS_MC, &blkszs[ BLIS_MC ],
      BLIS_NR, &blkszs[ BLIS_NR ],
      BLIS_MR, &blkszs[ BLIS_MR ],
      cntx
    );
#endif
}
cython-blis-1.0.0/blis/_src/config/armsve/bli_family_armsve.h 0000664 0000000 0000000 00000004203 14634250137 0024225 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the armsve configuration: memory-allocation parameters
// followed by default values for the SVE-specific settings.
// NOTE(review): the include guard below and its matching #endif at the end
// are commented out, so this header is unguarded -- confirm this is
// intentional.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// NOTE(review): presumably the SIMD alignment in bytes and the number of
// SIMD registers -- confirm against the code that consumes these macros.
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// SVE-specific configs.
// NOTE(review): the N_/W_/C_ prefixes look like number of sets, number of
// ways and line size for cache levels L1-L3 (e.g. L1: 64 * 4 * 256 bytes
// = 64 KiB) -- confirm against the SVE blocksize code that reads them.
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256
//#endif
cython-blis-1.0.0/blis/_src/config/armsve/make_defs.mk 0000664 0000000 0000000 00000005542 14634250137 0022646 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The store-make-defs call at the bottom of the
# file saves the variables assigned here under this name.
THIS_CONFIG    := armsve
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables afterwards; only configuration-specific flags are
# set in this fragment.

# Warning, PIC, miscellaneous and preprocessor flags.
CWARNFLAGS     :=
CPICFLAGS      :=
CMISCFLAGS     :=
CPPROCFLAGS    := -D_GNU_SOURCE

# Emit debug symbols unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
  CDBGFLAGS    := -g
endif

# Optimization flags: SVE-enabled -O3 by default, replaced by -O0 when
# DEBUG_TYPE is "noopt".
COPTFLAGS      := -O3 -ftree-vectorize -march=armv8-a+sve
ifeq ($(DEBUG_TYPE),noopt)
  COPTFLAGS    := -O0
endif

# Optimized kernels reuse the general optimization flags and add no
# vectorization flags of their own.
CKOPTFLAGS     := $(COPTFLAGS)
CKVECFLAGS     :=

# Reference kernels: same optimization flags; gcc and clang additionally
# get relaxed floating-point flags, any other vendor only the kernel
# vector flags.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
  CRVECFLAGS   := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
  CRVECFLAGS   := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
  CRVECFLAGS   := $(CKVECFLAGS)
endif

# Copy every variable set above into a counterpart whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/bgq/ 0000775 0000000 0000000 00000000000 14634250137 0017645 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/bgq/bli_cntx_init_bgq.c 0000664 0000000 0000000 00000006031 14634250137 0023467 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_bgq( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bszs[ BLIS_NUM_BLKSZS ];

    // Populate the context with the reference defaults first; everything
    // below overrides selected entries.
    bli_cntx_init_bgq_ref( cntx );

    // Register the two optimized native gemm micro-kernels (double and
    // double complex). The trailing boolean of each entry is that kernel's
    // storage-preference flag.
    bli_cntx_set_l3_nat_ukrs( 2,
        BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bgq_int_8x8, FALSE,
        BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE,
        cntx );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // NOTE(review): the s and c columns are 0, presumably because no
    // optimized kernel is registered for them -- confirm how
    // bli_cntx_set_blkszs treats non-positive values.
    //                                       s      d      c      z
    bli_blksz_init_easy( &bszs[ BLIS_MR ],   0,     8,     0,     4 );
    bli_blksz_init_easy( &bszs[ BLIS_NR ],   0,     8,     0,     4 );
    bli_blksz_init_easy( &bszs[ BLIS_MC ],   0,  1024,     0,   768 );
    bli_blksz_init_easy( &bszs[ BLIS_KC ],   0,  2048,     0,  1536 );
    bli_blksz_init_easy( &bszs[ BLIS_NC ],   0, 10240,     0, 10240 );

    // Install the five register/cache blocksizes for native execution.
    // Each entry is: blocksize id, blocksize object, id of its multiple.
    bli_cntx_set_blkszs( BLIS_NAT, 5,
        BLIS_NC, &bszs[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bszs[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bszs[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bszs[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bszs[ BLIS_MR ], BLIS_MR,
        cntx );
}
cython-blis-1.0.0/blis/_src/config/bgq/bli_family_bgq.h 0000664 0000000 0000000 00000006213 14634250137 0022760 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the include guard below (and its closing #endif at the end of this
// header) is commented out, so this file does not guard itself against
// multiple inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// Remove any macro definition of `restrict` that is in effect at this point.
// NOTE(review): presumably a workaround for how the BG/Q compilers treat the
// keyword -- confirm before removing.
#undef restrict
// Everything from here to the matching #endif is compiled out. The d and z
// blocksizes listed here match the values set in bli_cntx_init_bgq(); the
// s and c values and the *_UKERNEL names do not correspond to anything that
// function registers, so treat this region as historical reference only.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 1024
#define BLIS_DEFAULT_KC_S 2048
#define BLIS_DEFAULT_NC_S 8192
// 1 MPI RANK CASE:
#define BLIS_DGEMM_UKERNEL bli_dgemm_int_8x8
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MC_D 1024
#define BLIS_DEFAULT_KC_D 2048
#define BLIS_DEFAULT_NC_D 10240
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MC_C 1024
#define BLIS_DEFAULT_KC_C 2048
#define BLIS_DEFAULT_NC_C 8192
// NOTE(review): the kernel name says 8x8 while MR_Z/NR_Z below are 4/4.
#define BLIS_ZGEMM_UKERNEL bli_zgemm_int_8x8
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_DEFAULT_MC_Z 768
#define BLIS_DEFAULT_KC_Z 1536
#define BLIS_DEFAULT_NC_Z 10240
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
#define BLIS_DEFAULT_AF_D 8
#define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/bgq/make_defs.mk 0000664 0000000 0000000 00000007136 14634250137 0022123 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration. store-make-defs (invoked
# at the end of this file) copies the flag variables set below into
# variables whose names also contain this configuration name.
# NOTE: appending to CONFIGS_INCL is currently disabled.
THIS_CONFIG    := bgq
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

#ifeq ($(CC),)
#CC             := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r
#CC_VENDOR      := ibm
#endif

# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS    := -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk

# Only the ibm and clang compiler vendors are accepted; any other vendor
# aborts the build here.
ifeq ($(CC_VENDOR),ibm)
# Optional listing/report flags, kept for reference:
#   -qreport -qsource -qlistopt -qlist
# (They live on their own comment line: a trailing "# ..." on the
# assignment would leave trailing whitespace in the variable's value.)
CMISCFLAGS     := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm
else ifeq ($(CC_VENDOR),clang)
CMISCFLAGS     := -fopenmp
else
$(error xlc or bgclang is required for this configuration.)
endif
CPICFLAGS      :=
# -w is passed as the only warning flag.
CWARNFLAGS     := -w

# Request debug symbols for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Flags specific to optimized kernels.
# NOTE(review): -O3 is appended even when DEBUG_TYPE is "noopt"
# (COPTFLAGS = -O0) -- confirm that kernels are meant to stay optimized
# in that mode.
CKOPTFLAGS     := $(COPTFLAGS) -O3
# Always assign CKVECFLAGS. Without the else branch, a clang build would
# leave it unassigned and store-make-defs would capture whatever value was
# inherited from the environment or from a previously included makefile.
ifeq ($(CC_VENDOR),ibm)
CKVECFLAGS     := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa
else
CKVECFLAGS     :=
endif

# Flags specific to reference kernels.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif
endif

# Override the default value for LDFLAGS. Both accepted vendors link the
# SPI libraries; only the threading/OpenMP flags differ.
ifeq ($(CC_VENDOR),ibm)
LDFLAGS        := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -qthreaded -qsmp=omp
else ifeq ($(CC_VENDOR),clang)
LDFLAGS        := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -fopenmp
endif

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/bulldozer/ 0000775 0000000 0000000 00000000000 14634250137 0021076 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/bulldozer/bli_cntx_init_bulldozer.c 0000664 0000000 0000000 00000006317 14634250137 0026160 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_bulldozer( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bszs[ BLIS_NUM_BLKSZS ];

    // Populate the context with the reference defaults first; everything
    // below overrides selected entries.
    bli_cntx_init_bulldozer_ref( cntx );

    // Register one optimized native gemm micro-kernel per datatype
    // (s, d, c, z). The trailing boolean of each entry is that kernel's
    // storage-preference flag.
    bli_cntx_set_l3_nat_ukrs( 4,
        BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_bulldozer_asm_8x8_fma4, FALSE,
        BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_bulldozer_asm_4x6_fma4, FALSE,
        BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE,
        BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE,
        cntx );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // The MR/NR rows agree with the dimensions in the kernel names above.
    //                                       s      d      c      z
    bli_blksz_init_easy( &bszs[ BLIS_MR ],   8,     4,     8,     4 );
    bli_blksz_init_easy( &bszs[ BLIS_NR ],   8,     6,     4,     4 );
    bli_blksz_init_easy( &bszs[ BLIS_MC ], 128,  1080,    96,    64 );
    bli_blksz_init_easy( &bszs[ BLIS_KC ], 384,   120,   256,   192 );
    bli_blksz_init_easy( &bszs[ BLIS_NC ], 4096, 8400,  4096,  4096 );

    // Install the five register/cache blocksizes for native execution.
    // Each entry is: blocksize id, blocksize object, id of its multiple.
    bli_cntx_set_blkszs( BLIS_NAT, 5,
        BLIS_NC, &bszs[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bszs[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bszs[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bszs[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bszs[ BLIS_MR ], BLIS_MR,
        cntx );
}
cython-blis-1.0.0/blis/_src/config/bulldozer/bli_family_bulldozer.h 0000664 0000000 0000000 00000005423 14634250137 0025444 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the include guard below (and its closing #endif at the end of this
// header) is commented out, so this file does not guard itself against
// multiple inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// Everything from here to the matching #endif is compiled out, which leaves
// this header with no active definitions. The blocksize values listed here
// equal the ones set in bli_cntx_init_bulldozer(); the *_UKERNEL names lack
// the "bulldozer" infix used by the kernels that function registers.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 8
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 6
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4
#define BLIS_DEFAULT_MC_C 96
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 192
#define BLIS_DEFAULT_NC_Z 4096
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/bulldozer/make_defs.mk 0000664 0000000 0000000 00000006126 14634250137 0023352 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. store-make-defs (invoked at the end of this
# file) uses it to build the names of the per-configuration variables.
# NOTE: appending to CONFIGS_INCL is currently disabled.
THIS_CONFIG    := bulldozer
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags ---
#
# common.mk appends its own general-purpose, configuration-agnostic flags
# to these variables; add configuration-specific extras here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE "noopt" turns optimization off; any other value selects -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Optimized kernels: general optimization flags plus -O3, and the
# FMA4/AVX vectorization flags. gcc and clang receive the same set; any
# other compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif

# Reference kernels: reuse the optimized-kernel flags; gcc and clang
# additionally receive relaxed floating-point flags.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into a variable whose name also carries
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/cortexa15/ 0000775 0000000 0000000 00000000000 14634250137 0020707 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/cortexa15/bli_cntx_init_cortexa15.c 0000664 0000000 0000000 00000006646 14634250137 0025607 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_cortexa15( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bszs[ BLIS_NUM_BLKSZS ];

    // Populate the context with the reference defaults first; everything
    // below overrides selected entries.
    bli_cntx_init_cortexa15_ref( cntx );

    // Register the optimized native gemm micro-kernels for float and
    // double. The trailing boolean of each entry is that kernel's
    // storage-preference flag.
    bli_cntx_set_l3_nat_ukrs( 2,
        BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv7a_int_4x4, FALSE,
        BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
        cntx );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // NOTE(review): -1 presumably means "keep the reference value" for
    // that datatype -- confirm against bli_cntx_set_blkszs.
    //                                       s      d      c      z
#if 1
    // Active variant: tuned values for both float and double.
    bli_blksz_init_easy( &bszs[ BLIS_MR ],    4,    4,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NR ],    4,    4,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_MC ],  336,  176,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_KC ],  528,  368,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NC ], 4096, 4096,    -1,    -1 );
#else
    // Disabled variant: tuned values for double only.
    bli_blksz_init_easy( &bszs[ BLIS_MR ],   -1,    4,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NR ],   -1,    4,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_MC ],   -1,  176,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_KC ],   -1,  368,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NC ],   -1, 4096,    -1,    -1 );
#endif

    // Install the five register/cache blocksizes for native execution.
    // Each entry is: blocksize id, blocksize object, id of its multiple.
    bli_cntx_set_blkszs( BLIS_NAT, 5,
        BLIS_NC, &bszs[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bszs[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bszs[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bszs[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bszs[ BLIS_MR ], BLIS_MR,
        cntx );
}
cython-blis-1.0.0/blis/_src/config/cortexa15/bli_family_cortexa15.h 0000664 0000000 0000000 00000005572 14634250137 0025073 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the include guard below (and its closing #endif at the end of this
// header) is commented out, so this file does not guard itself against
// multiple inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// The only active definition in this header.
// NOTE(review): presumably a size in bytes (16 bytes = 128 bits) -- confirm
// against the code that consumes BLIS_SIMD_ALIGN_SIZE.
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything from here to the matching #endif is compiled out. The s and d
// values listed here equal the ones set in bli_cntx_init_cortexa15(); that
// function passes -1 for c and z, so the c/z values here are not reflected
// in it.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#define BLIS_SGEMM_UKERNEL bli_sgemm_armv7a_int_4x4
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 336
#define BLIS_DEFAULT_KC_S 528
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DGEMM_UKERNEL bli_dgemm_armv7a_int_4x4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 176
#define BLIS_DEFAULT_KC_D 368
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/cortexa15/make_defs.mk 0000664 0000000 0000000 00000005674 14634250137 0023172 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. store-make-defs (invoked at the end of this
# file) uses it to build the names of the per-configuration variables.
# NOTE: appending to CONFIGS_INCL is currently disabled.
THIS_CONFIG    := cortexa15
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags ---
#
# common.mk appends its own general-purpose, configuration-agnostic flags
# to these variables; add configuration-specific extras here.
CPPROCFLAGS    :=
CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE "noopt" turns optimization off; any other value selects -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Optimized kernels: general optimization flags plus -O3, tuned for the
# Cortex-A15 CPU. Any compiler vendor other than gcc aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -mcpu=cortex-a15
else
$(error gcc is required for this configuration.)
endif

# Reference kernels: reuse the optimized-kernel flags; gcc and clang
# additionally receive relaxed floating-point flags.
# NOTE: only the gcc branch is reachable, because every other vendor has
# already hit the $(error ...) above.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into a variable whose name also carries
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/cortexa53/ 0000775 0000000 0000000 00000000000 14634250137 0020711 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/cortexa53/bli_cntx_init_cortexa53.c 0000664 0000000 0000000 00000006055 14634250137 0025605 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_cortexa53( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bszs[ BLIS_NUM_BLKSZS ];

    // Populate the context with the reference defaults first; everything
    // below overrides selected entries.
    bli_cntx_init_cortexa53_ref( cntx );

    // Register the optimized native gemm micro-kernels for float and
    // double. The trailing boolean of each entry is that kernel's
    // storage-preference flag.
    bli_cntx_set_l3_nat_ukrs( 2,
        BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12, FALSE,
        BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,  FALSE,
        cntx );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // The s/d MR and NR values agree with the dimensions in the kernel
    // names above.
    // NOTE(review): -1 presumably means "keep the reference value" for
    // that datatype -- confirm against bli_cntx_set_blkszs.
    //                                       s      d      c      z
    bli_blksz_init_easy( &bszs[ BLIS_MR ],    8,    6,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NR ],   12,    8,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_MC ],  120,  120,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_KC ],  640,  240,    -1,    -1 );
    bli_blksz_init_easy( &bszs[ BLIS_NC ], 3072, 3072,    -1,    -1 );

    // Install the five register/cache blocksizes for native execution.
    // Each entry is: blocksize id, blocksize object, id of its multiple.
    bli_cntx_set_blkszs( BLIS_NAT, 5,
        BLIS_NC, &bszs[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bszs[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bszs[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bszs[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bszs[ BLIS_MR ], BLIS_MR,
        cntx );
}
cython-blis-1.0.0/blis/_src/config/cortexa53/bli_family_cortexa53.h 0000664 0000000 0000000 00000003410 14634250137 0025064 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- MEMORY ALLOCATION --------------------------------------------------------
// The only definition in this header: the SIMD alignment size for this
// configuration.
// NOTE(review): presumably a size in bytes (16 bytes = 128 bits) -- confirm
// against the code that consumes BLIS_SIMD_ALIGN_SIZE.
#define BLIS_SIMD_ALIGN_SIZE 16
cython-blis-1.0.0/blis/_src/config/cortexa53/make_defs.mk 0000664 0000000 0000000 00000006045 14634250137 0023165 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration. It is passed to
# store-make-defs at the bottom of this file. Note that the CONFIGS_INCL
# registration below is commented out, so this file does not itself add the
# configuration to that list.
THIS_CONFIG    := cortexa53
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols for every DEBUG_TYPE other than "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE=noopt selects -O0 for COPTFLAGS; otherwise optimize and tune
# for the Cortex-A53.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2 -mcpu=cortex-a53
endif

# Flags specific to optimized kernels.
# -O3 is appended after $(COPTFLAGS), so it is the last -O level on the
# kernel command line.
CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
# gcc and clang take the same vectorization flags; any other vendor is
# rejected outright.
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CKVECFLAGS     := -mcpu=cortex-a53
else
$(error gcc or clang is required for this configuration.)
endif

# Flags specific to reference kernels.
# CKOPTFLAGS already ends the optimization level at -O3, so it is reused
# as-is rather than appending a second, redundant -O3.
CROPTFLAGS     := $(CKOPTFLAGS)
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/cortexa57/ 0000775 0000000 0000000 00000000000 14634250137 0020715 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/cortexa57/bli_cntx_init_cortexa57.c 0000664 0000000 0000000 00000006055 14634250137 0025615 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Initialize cntx for the cortexa57 sub-configuration.

   The context is first populated by bli_cntx_init_cortexa57_ref(), after
   which the single- and double-precision gemm micro-kernels and the five
   level-3 blocksizes are replaced with the values listed below.

   cntx: the context to update in place.
*/
void bli_cntx_init_cortexa57( cntx_t* cntx )
{
    // Scratch blocksize objects, indexed by blocksize id. Only the five
    // entries initialized below are handed to bli_cntx_set_blkszs().
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions.
    bli_cntx_init_cortexa57_ref( cntx );

    // Install the optimized native gemm micro-kernels together with their
    // storage preferences. The leading count matches the number of
    // ( kernel id, datatype, function, preference ) entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,  FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // The individual initializations are independent of one another.
    // NOTE(review): -1 in the c/z columns presumably leaves the values set by
    // the _ref initializer untouched -- confirm in bli_cntx_set_blkszs().
    //                                       s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_NC ], 3072, 3072,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],  640,  240,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],  120,  120,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],   12,    8,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ],    8,    6,   -1,   -1 );

    // Hand the register and cache blocksizes (and the blocksize id each one
    // is paired with as its multiple) to the context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/cortexa57/bli_family_cortexa57.h 0000664 0000000 0000000 00000006024 14634250137 0025100 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the BLIS_FAMILY_H include guard (here and at the end of this file) is
// commented out, so this header is not protected against repeated inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Sets the SIMD alignment constant for the cortexa57 configuration.
// NOTE(review): presumably a byte count -- confirm against the BLIS
// configuration guide.
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything from "#if 0" to the matching "#endif" is removed by the
// preprocessor and has no effect on the build. The single- and
// double-precision MR/NR/MC/KC/NC values listed here equal the ones passed to
// bli_blksz_init_easy() in bli_cntx_init_cortexa57.c.
// NOTE(review): this looks like a retained macro-based configuration that the
// context initializer replaced -- confirm before relying on it.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single precision (the trailing // numbers are alternative values left in
// place by the original author).
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 12
#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336
#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528
#define BLIS_DEFAULT_NC_S 3072
// Double precision.
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176
#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368
#define BLIS_DEFAULT_NC_D 3072
// Single-precision complex (no micro-kernel macro is given for this type).
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
// Double-precision complex (no micro-kernel macro is given for this type).
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/cortexa57/make_defs.mk 0000664 0000000 0000000 00000006041 14634250137 0023165 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this sub-configuration; store-make-defs (bottom of this file)
# receives it as its argument. The CONFIGS_INCL registration that follows is
# commented out.
THIS_CONFIG    := cortexa57
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# the variables below, so only configuration-specific additions belong here.
CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE=noopt selects -O0 for COPTFLAGS; every other setting optimizes
# and tunes for the Cortex-A57.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2 -mcpu=cortex-a57
endif

# Flags specific to optimized kernels. gcc and clang receive identical
# vectorization flags; any other compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CKVECFLAGS     := -mcpu=cortex-a57
else
$(error gcc or clang is required for this configuration.)
endif

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point options; other vendors only inherit CKVECFLAGS.
CROPTFLAGS     := $(CKOPTFLAGS)
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable defined above into a new variable whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/cortexa9/ 0000775 0000000 0000000 00000000000 14634250137 0020632 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/cortexa9/bli_cntx_init_cortexa9.c 0000664 0000000 0000000 00000006051 14634250137 0025443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Initialize cntx for the cortexa9 sub-configuration.

   The context is first populated by bli_cntx_init_cortexa9_ref(), after
   which the single- and double-precision gemm micro-kernels and the five
   level-3 blocksizes are replaced with the values listed below.

   cntx: the context to update in place.
*/
void bli_cntx_init_cortexa9( cntx_t* cntx )
{
    // Scratch blocksize objects, indexed by blocksize id. Only the five
    // entries initialized below are handed to bli_cntx_set_blkszs().
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions.
    bli_cntx_init_cortexa9_ref( cntx );

    // Install the optimized native gemm micro-kernels together with their
    // storage preferences. The leading count matches the number of
    // ( kernel id, datatype, function, preference ) entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv7a_int_4x4, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // The individual initializations are independent of one another.
    // NOTE(review): 0 in the c/z columns presumably leaves the values set by
    // the _ref initializer untouched -- confirm in bli_cntx_set_blkszs().
    //                                       s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_NC ], 4096, 4096,    0,    0 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],  352,  368,    0,    0 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],  432,  176,    0,    0 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    4,    4,    0,    0 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ],    4,    4,    0,    0 );

    // Hand the register and cache blocksizes (and the blocksize id each one
    // is paired with as its multiple) to the context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/cortexa9/bli_family_cortexa9.h 0000664 0000000 0000000 00000005566 14634250137 0024744 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the BLIS_FAMILY_H include guard (here and at the end of this file) is
// commented out, so this header is not protected against repeated inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Sets the SIMD alignment constant for the cortexa9 configuration.
// NOTE(review): presumably a byte count -- confirm against the BLIS
// configuration guide.
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything from "#if 0" to the matching "#endif" is removed by the
// preprocessor and has no effect on the build. The single- and
// double-precision kernel names and MR/NR/MC/KC/NC values listed here equal
// the ones used in bli_cntx_init_cortexa9.c; the complex values below have no
// counterpart there (that file passes 0 for the c and z columns).
// NOTE(review): this looks like a retained macro-based configuration that the
// context initializer replaced -- confirm before relying on it.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single precision.
#define BLIS_SGEMM_UKERNEL bli_sgemm_armv7a_int_4x4
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 432
#define BLIS_DEFAULT_KC_S 352
#define BLIS_DEFAULT_NC_S 4096
// Double precision.
#define BLIS_DGEMM_UKERNEL bli_dgemm_armv7a_int_4x4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 176
#define BLIS_DEFAULT_KC_D 368
#define BLIS_DEFAULT_NC_D 4096
// Single-precision complex (no micro-kernel macro is given for this type).
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
// Double-precision complex (no micro-kernel macro is given for this type).
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/cortexa9/make_defs.mk 0000664 0000000 0000000 00000005672 14634250137 0023113 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this sub-configuration; store-make-defs (bottom of this file)
# receives it as its argument. The CONFIGS_INCL registration that follows is
# commented out.
THIS_CONFIG    := cortexa9
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# the variables below, so only configuration-specific additions belong here.
CPPROCFLAGS    :=
CMISCFLAGS     := -mfloat-abi=hard -mfpu=neon
CPICFLAGS      :=
CWARNFLAGS     :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE=noopt selects -O0 for COPTFLAGS; every other setting uses -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Flags specific to optimized kernels. Only gcc is accepted here; any other
# compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
else
CKVECFLAGS     := -mcpu=cortex-a9
endif

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point options; other vendors only inherit CKVECFLAGS.
# NOTE(review): the $(error ...) above stops make for every vendor other than
# gcc, so the clang and fallback cases here look unreachable -- confirm
# whether clang support was intended.
CROPTFLAGS     := $(CKOPTFLAGS)
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable defined above into a new variable whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/excavator/ 0000775 0000000 0000000 00000000000 14634250137 0021070 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/excavator/bli_cntx_init_excavator.c 0000664 0000000 0000000 00000006303 14634250137 0026137 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Initialize cntx for the excavator sub-configuration.

   The context is first populated by bli_cntx_init_excavator_ref(), after
   which the gemm micro-kernels for all four datatypes and the five level-3
   blocksizes are replaced with the values listed below. The micro-kernels
   registered here are the piledriver assembly kernels.

   cntx: the context to update in place.
*/
void bli_cntx_init_excavator( cntx_t* cntx )
{
    // Scratch blocksize objects, indexed by blocksize id. Only the five
    // entries initialized below are handed to bli_cntx_set_blkszs().
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions.
    bli_cntx_init_excavator_ref( cntx );

    // Install the optimized native gemm micro-kernels together with their
    // storage preferences. The leading count matches the number of
    // ( kernel id, datatype, function, preference ) entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      4,
      BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // The individual initializations are independent of one another.
    //                                       s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_NC ], 8400, 8400, 8400, 8400 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],  256,  256,  256,  320 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],  528,  264,  264,  100 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    3,    3,    2,    2 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ],   16,    8,    4,    2 );

    // Hand the register and cache blocksizes (and the blocksize id each one
    // is paired with as its multiple) to the context for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/excavator/bli_family_excavator.h 0000664 0000000 0000000 00000005735 14634250137 0025436 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: the BLIS_FAMILY_H include guard (here and at the end of this file) is
// commented out, so this header is not protected against repeated inclusion.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// Sets the SIMD alignment constant for the excavator configuration.
// NOTE(review): presumably a byte count -- confirm against the BLIS
// configuration guide.
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything from "#if 0" to the matching "#endif" is removed by the
// preprocessor and has no effect on the build. The MR/NR/MC/KC/NC values
// listed here equal the ones passed to bli_blksz_init_easy() in
// bli_cntx_init_excavator.c; the kernel names differ (that file registers
// the bli_?gemm_piledriver_asm_* functions).
// NOTE(review): this looks like a retained macro-based configuration that the
// context initializer replaced -- confirm before relying on it.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single precision.
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 3
#define BLIS_DEFAULT_MC_S 528
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8400
// Double precision.
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 3
#define BLIS_DEFAULT_MC_D 264
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 8400
// Single-precision complex.
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 2
#define BLIS_DEFAULT_MC_C 264
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 8400
// Double-precision complex.
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
#define BLIS_DEFAULT_MC_Z 100
#define BLIS_DEFAULT_KC_Z 320
#define BLIS_DEFAULT_NC_Z 8400
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/excavator/make_defs.mk 0000664 0000000 0000000 00000006150 14634250137 0023341 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this sub-configuration; store-make-defs (bottom of this file)
# receives it as its argument. The CONFIGS_INCL registration that follows is
# commented out.
THIS_CONFIG    := excavator
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# the variables below, so only configuration-specific additions belong here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# DEBUG_TYPE=noopt selects -O0 for COPTFLAGS; every other setting uses -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Flags specific to optimized kernels. gcc and clang receive identical
# vectorization flags; any other compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CKVECFLAGS     := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif

# Flags specific to reference kernels. gcc and clang additionally get the
# relaxed floating-point options; other vendors only inherit CKVECFLAGS.
CROPTFLAGS     := $(CKOPTFLAGS)
ifneq ($(filter gcc clang,$(CC_VENDOR)),)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable defined above into a new variable whose name contains
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/firestorm/ 0000775 0000000 0000000 00000000000 14634250137 0021106 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/firestorm/bli_cntx_init_firestorm.c 0000664 0000000 0000000 00000012762 14634250137 0026201 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_firestorm( cntx_t* cntx )
{
    // Scratch tables. `bsz` is filled twice: once for the native path and
    // then again (double-precision column only) for the sup path.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];
    blksz_t sup_thr[ BLIS_NUM_THRESH ];

    // Begin from the reference defaults for this configuration; the calls
    // that follow layer the optimized settings on top.
    bli_cntx_init_firestorm_ref( cntx );

    // ---------------------------------------------------------------------
    // Native (packed) execution
    // ---------------------------------------------------------------------

    // Register the armv8a assembly gemm micro-kernels for float and double.
    // The leading integer is the number of entries that follow; the last
    // field of each entry is the kernel's storage-preference flag.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,  FALSE,
      cntx
    );

    // Register the packm kernels whose panel widths match the MR/NR values
    // chosen below (8 and 12 for float, 6 and 8 for double).
    bli_cntx_set_packm_kers
    (
      4,
      BLIS_PACKM_8XK_KER,  BLIS_FLOAT,  bli_spackm_armv8a_int_8xk,
      BLIS_PACKM_12XK_KER, BLIS_FLOAT,  bli_spackm_armv8a_int_12xk,
      BLIS_PACKM_6XK_KER,  BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk,
      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk,
      cntx
    );

    // Native level-3 blocksizes, one column per datatype.
    //                                     s      d      c      z
    bli_blksz_init_easy( &bsz[ BLIS_MR ],     8,     6,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    12,     8,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   120,   252,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   640,  3072,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  3072,  8192,    -1,    -1 );

    // Hand the register and cache blocksizes to the context. Each entry
    // names the blocksize, its value object, and the blocksize it must be
    // a multiple of.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );

    // ---------------------------------------------------------------------
    // Small/unpacked (sup) execution -- double precision only
    // ---------------------------------------------------------------------

    // Sup thresholds for the m, n and k dimensions.
    //                                         s      d      c      z
    bli_blksz_init_easy( &sup_thr[ BLIS_MT ],    -1,    99,    -1,    -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_NT ],    -1,    99,    -1,    -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_KT ],    -1,    99,    -1,    -1 );

    // Store the thresholds in the context.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &sup_thr[ BLIS_MT ],
      BLIS_NT, &sup_thr[ BLIS_NT ],
      BLIS_KT, &sup_thr[ BLIS_KT ],
      cntx
    );

    // Register one dgemmsup kernel for each of the eight storage
    // combinations (RRR ... CCC).
    bli_cntx_set_l3_sup_kers
    (
      8,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE,
      cntx
    );

    // Sup blocksizes. This overwrites the native values held in `bsz`,
    // which have already been passed to bli_cntx_set_blkszs() above.
    //                                     s      d      c      z
    bli_blksz_init_easy( &bsz[ BLIS_MR ],    -1,     6,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    -1,     8,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],    -1,   240,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],    -1,  1024,    -1,    -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],    -1,  3072,    -1,    -1 );

    // Hand the sup register and cache blocksizes to the context.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &bsz[ BLIS_NC ],
      BLIS_KC, &bsz[ BLIS_KC ],
      BLIS_MC, &bsz[ BLIS_MC ],
      BLIS_NR, &bsz[ BLIS_NR ],
      BLIS_MR, &bsz[ BLIS_MR ],
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/firestorm/bli_family_firestorm.h 0000664 0000000 0000000 00000006024 14634250137 0025462 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// SIMD alignment constant for the firestorm family. This is the only macro in
// this header that is actually defined (presumably a byte count -- confirm
// against the code that consumes BLIS_SIMD_ALIGN_SIZE).
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything from here to the matching #endif is compiled out by `#if 0`:
// none of the BLIS_*GEMM_UKERNEL / BLIS_DEFAULT_* macros below are defined.
// NOTE(review): the single-precision MR/NR/MC/KC/NC values listed here equal
// the ones bli_cntx_init_firestorm() installs at run time; the double-precision
// MC/KC values (120/240) differ from that function's (252/3072).
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 12
// The trailing //-separated numbers on the MC/KC lines are inactive comment
// text (they look like previously tried values -- unverified).
#define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336
#define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528
#define BLIS_DEFAULT_NC_S 3072
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176
#define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368
#define BLIS_DEFAULT_NC_D 3072
// Complex (C/Z) blocksizes; no complex micro-kernel macro is named in this
// block.
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/firestorm/make_defs.mk 0000664 0000000 0000000 00000005570 14634250137 0023364 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The CONFIGS_INCL append below is commented
# out, so this file does not itself add the name to that list.
THIS_CONFIG    := firestorm
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# common.mk appends further general-purpose, configuration-agnostic
# flags to these variables; put configuration-specific extras here.

CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless DEBUG_TYPE is "off". When it is "off",
# CDBGFLAGS is not assigned by this file at all.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# "noopt" builds disable optimization; every other build uses -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2 -march=armv8-a
endif

# Optimized kernels: general flags plus -O3 and tree vectorization.
CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
CKVECFLAGS     := -march=armv8-a

# Reference kernels: same optimization flags as the optimized kernels.
# gcc and clang additionally get relaxed floating-point flags; any
# other vendor just inherits the kernel vector flags.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into a per-configuration variable whose
# name carries $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/generic/ 0000775 0000000 0000000 00000000000 14634250137 0020510 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/generic/bli_cntx_init_generic.c 0000664 0000000 0000000 00000003442 14634250137 0025200 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_generic( cntx_t* cntx )
{
    // The generic configuration installs no optimized kernels or
    // blocksizes of its own: it only applies the reference defaults.
    bli_cntx_init_generic_ref( cntx );
}
cython-blis-1.0.0/blis/_src/config/generic/bli_family_generic.h 0000664 0000000 0000000 00000003307 14634250137 0024467 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/generic/make_defs.mk 0000664 0000000 0000000 00000006010 14634250137 0022754 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The CONFIGS_INCL append below is commented
# out, so this file does not itself add the name to that list.
THIS_CONFIG    := generic
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# common.mk appends further general-purpose, configuration-agnostic
# flags to these variables; put configuration-specific extras here.

CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless DEBUG_TYPE is "off". When it is "off",
# CDBGFLAGS is not assigned by this file at all.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# "noopt" builds disable optimization; every other build uses -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Optimized kernels. No vector flags are set for any supported vendor;
# an unsupported vendor stops the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     :=
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS     :=
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     :=
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference kernels: same optimization flags as the optimized kernels.
# gcc and clang additionally get relaxed floating-point flags; any
# other vendor just inherits the kernel vector flags.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into a per-configuration variable whose
# name carries $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/haswell/ 0000775 0000000 0000000 00000000000 14634250137 0020533 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/haswell/bli_cntx_init_haswell.c 0000664 0000000 0000000 00000023214 14634250137 0025245 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
// Initializes `cntx` for the haswell sub-configuration: applies the reference
// defaults, then registers optimized gemm/gemmtrsm micro-kernels, packm
// kernels, level-1f and level-1v kernels, native blocksizes, and finally the
// small/unpacked (sup) thresholds, kernels and blocksizes.
void bli_cntx_init_haswell( cntx_t* cntx )
{
// Local scratch tables. blkszs is filled for the native path first and its
// MR/NR/MC/KC/NC entries are overwritten later for the sup path, so the order
// of the calls below matters.
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
// The leading count (8) equals the four gemm entries from whichever #if
// branch is active plus the four gemmtrsm entries; keep it in sync when
// editing the list. The trailing TRUE/FALSE of each entry is the storage
// preference flag (presumably TRUE = row preference -- confirm against
// bli_cntx_set_l3_nat_ukrs).
bli_cntx_set_l3_nat_ukrs
(
8,
// gemm
#if 1
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
#else
// Inactive alternative: kernels with the transposed register shapes.
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE,
#endif
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
// The #if 1 / #endif pair around this call makes the optimized packm
// registration easy to switch off; it is currently enabled.
#if 1
// Update the context with optimized packm kernels.
// Two panel widths per datatype, matching the MR and NR values of the
// native blocksizes set below (6/16, 6/8, 3/8, 3/4).
bli_cntx_set_packm_kers
(
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
#endif
// Update the context with optimized level-1f kernels.
// Note that these (and the level-1v kernels below) are the zen-named
// kernels, registered for float and double only.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
// The count (10) is two entries for each of amaxv, axpyv, dotv, dotxv and
// scalv; each #if 0 / #else pair contributes exactly two of them.
bli_cntx_set_l1v_kers
(
10,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
// (the "int10" variants in the #else branch are the active ones)
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
// (the "int10" variants in the #else branch are the active ones)
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
// NOTE(review): the MR/NR values in the active branch mirror the shapes of
// the gemm kernels in the active branch of the kernel list above (6x16, 6x8,
// 3x8, 3x4); the two #if toggles presumably need to be flipped together.
#if 1
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 );
//bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
#endif
// NC and the level-1f fusing factors (AF, DF) are the same for both branches.
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
// Each entry is: blocksize id, pointer to its value object, id of the
// blocksize it must be a multiple of. The count (7) is 5 level-3 + 2 level-1f.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
// Only the float and double columns carry a threshold (201); the complex
// columns are -1, and only float/double sup kernels are registered below.
bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Compiled out: explicit registration of a sup handler for gemm.
#if 0
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
1,
BLIS_GEMM, bli_gemmsup_ref,
cntx
);
#endif
// Update the context with optimized small/unpacked gemm kernels.
// One kernel per storage combination (RRR ... CCC), eight for double
// followed by eight for float, giving the count of 16.
bli_cntx_set_l3_sup_kers
(
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
// These calls overwrite the native MR/NR/MC/KC/NC entries of blkszs, which
// were already handed to bli_cntx_set_blkszs() above. MR uses bli_blksz_init
// rather than the _easy variant and so passes a second row of four values
// (9, 9, -1, -1) -- presumably the maximum blocksizes; confirm against
// bli_blksz_init.
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}
cython-blis-1.0.0/blis/_src/config/haswell/bli_family_haswell.h 0000664 0000000 0000000 00000011421 14634250137 0024531 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// The whole body of this header is wrapped in this outer `#if 0`, so none of
// the macros below are ever defined. The inner `#if 1` / `#if 0` toggles
// choose between alternative micro-kernel shapes per datatype, but they have
// no effect while the outer guard stays 0.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
// -- sgemm micro-kernel --
// Alternative 1 (inner toggle off): 4x24 kernel.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 24
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 2 (inner toggle on): 6x16 kernel.
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 3 (inner toggle off): 16x6 kernel; does not define the
// PREFERS_CONTIG_ROWS macro.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 6
#endif
// -- dgemm micro-kernel --
// Alternative 1 (inner toggle off): 4x12 kernel.
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D 152
#define BLIS_DEFAULT_KC_D 160
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 12
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 2 (inner toggle on): 6x8 kernel.
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 3 (inner toggle off): 8x6 kernel.
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
#endif
// -- cgemm micro-kernel --
// Alternative 1 (inner toggle on): 3x8 kernel.
#if 1
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 3
#define BLIS_DEFAULT_NR_C 8
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 2 (inner toggle off): 8x3 kernel.
#if 0
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 3
#endif
// -- zgemm micro-kernel --
// Alternative 1 (inner toggle on): 3x4 kernel.
#if 1
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 3
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative 2 (inner toggle off): 4x3 kernel.
#if 0
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 3
#endif
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/haswell/make_defs.mk 0000664 0000000 0000000 00000006632 14634250137 0023011 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name this configuration. The CONFIGS_INCL append below is commented
# out, so this file does not itself add the name to that list.
THIS_CONFIG    := haswell
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# common.mk appends further general-purpose, configuration-agnostic
# flags to these variables; put configuration-specific extras here.

CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless DEBUG_TYPE is "off". When it is "off",
# CDBGFLAGS is not assigned by this file at all.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# "noopt" builds disable optimization; every other build uses -O2.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
else
COPTFLAGS      := -O2
endif

# Optimized kernels.
# -fomit-frame-pointer is required because some kernels make explicit
# use of the rbp register.
CKOPTFLAGS     := $(COPTFLAGS) -O3 -fomit-frame-pointer

# Vector flags per compiler vendor. An unsupported vendor stops the
# build.
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_4_9_0),yes)
# gcc older than 4.9.0 needs a different label for -march.
CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=core-avx2
else
CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=haswell
endif
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS     := -xCORE-AVX2
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse -march=haswell
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference kernels: same optimization flags as the optimized kernels.
# gcc and clang additionally get relaxed floating-point flags; any
# other vendor just inherits the kernel vector flags.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into a per-configuration variable whose
# name carries $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/intel64/ 0000775 0000000 0000000 00000000000 14634250137 0020361 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/intel64/bli_family_intel64.h 0000664 0000000 0000000 00000003306 14634250137 0024210 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/intel64/make_defs.mk 0000664 0000000 0000000 00000006124 14634250137 0022633 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration described by this file; the store-make-defs
# call at the bottom saves the variables below under this name.
# (Registration in CONFIGS_INCL is currently commented out.)
THIS_CONFIG := intel64
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables; add only configuration-specific flags here.

CPPROCFLAGS :=
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=

# Debug symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Base optimization level; "noopt" debug builds disable optimization.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized-kernel flags, with vector options chosen per compiler vendor.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xSSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference-kernel flags: gcc and clang add relaxed floating-point
# options on top of the kernel vector flags; other vendors reuse the
# kernel vector flags unchanged.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/intel64_no_skx/ 0000775 0000000 0000000 00000000000 14634250137 0021742 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/intel64_no_skx/bli_family_intel64.h 0000664 0000000 0000000 00000003306 14634250137 0025571 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/intel64_no_skx/bli_family_intel64_no_skx.h 0000664 0000000 0000000 00000003306 14634250137 0027152 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/intel64_no_skx/make_defs.mk 0000664 0000000 0000000 00000005664 14634250137 0024224 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration described by this file; the store-make-defs
# call at the bottom saves the variables below under this name.
# (Registration in CONFIGS_INCL is currently commented out.)
THIS_CONFIG := intel64_no_skx
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables; add only configuration-specific flags here.

CPPROCFLAGS :=
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=

# Debug symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Base optimization level; "noopt" debug builds disable optimization.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3
else
COPTFLAGS := -O0
endif

# Optimized-kernel flags: the base optimization flags are used as-is,
# with vector options chosen per compiler vendor.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xSSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference-kernel flags: identical to the optimized-kernel flags for
# every compiler vendor in this configuration.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/knc/ 0000775 0000000 0000000 00000000000 14634250137 0017647 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/knc/bli_cntx_init_knc.c 0000664 0000000 0000000 00000006150 14634250137 0023475 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   bli_cntx_init_knc

   Initialize a context for the knc configuration.

   The context is first filled in by bli_cntx_init_knc_ref() and then
   updated in place with the optimized double-precision gemm micro-kernel
   and the blocksizes below. Only the d column carries non-zero
   blocksizes; the s, c and z columns are passed as 0.

   cntx: the context to initialize (modified in place).
*/
void bli_cntx_init_knc( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Set default kernel blocksizes and functions.
    bli_cntx_init_knc_ref( cntx );

    // -------------------------------------------------------------------------

    // Update the context with optimized native gemm micro-kernels and
    // their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      1,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE,
      cntx
    );

    // Initialize level-3 blocksize objects with architecture-specific values.
    // MC and KC pass two rows of four values and therefore must go through
    // bli_blksz_init(); bli_blksz_init_easy() is the single-row form used
    // for MR, NR and NC.
    // NOTE(review): the second row is presumably the maximum blocksize for
    // each datatype -- confirm against the bli_blksz_init() declaration.
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,    30,     0,     0 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,     8,     0,     0 );
    bli_blksz_init     ( &blkszs[ BLIS_MC ],     0,   120,     0,     0,
                                                 0,   160,     0,     0 );
    bli_blksz_init     ( &blkszs[ BLIS_KC ],     0,   240,     0,     0,
                                                 0,   300,     0,     0 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0, 14400,     0,     0 );

    // Update the context with the current architecture's register and cache
    // blocksizes (and multiples) for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/knc/bli_family_knc.h 0000664 0000000 0000000 00000006724 14634250137 0022773 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- THREADING PARAMETERS -----------------------------------------------------
// Defines BLIS_TREE_BARRIER together with an arity of 4.
// NOTE(review): presumably selects a tree-based thread barrier -- confirm in
// the threading code that consumes these macros.
#define BLIS_TREE_BARRIER
#define BLIS_TREE_BARRIER_ARITY 4
// -- MEMORY ALLOCATION --------------------------------------------------------
// SIMD alignment, maximum SIMD size and SIMD register count for this
// configuration.
// NOTE(review): the two sizes are presumably expressed in bytes -- confirm.
#define BLIS_SIMD_ALIGN_SIZE 64
#define BLIS_SIMD_MAX_SIZE 64
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// Everything from this #if 0 to its matching #endif is compiled out; the
// macros below therefore have no effect on the build.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single-precision (S) micro-kernel name and default blocksizes.
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_30x16
#define BLIS_DEFAULT_MR_S 30
#define BLIS_DEFAULT_NR_S 16
#define BLIS_DEFAULT_MC_S 240
#define BLIS_DEFAULT_KC_S 240
#define BLIS_DEFAULT_NC_S 14400
// Double-precision (D) micro-kernel name and default blocksizes.
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_30x8
#define BLIS_DEFAULT_MR_D 30
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DEFAULT_MC_D 120
#define BLIS_DEFAULT_KC_D 240
#define BLIS_DEFAULT_NC_D 14400
// Maximum cache blocksizes: the default plus one quarter of the default for
// MC and KC, and the unchanged default for NC.
#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
// Packing dimensions: MR is the default plus 2; the NR variants are left
// commented out.
#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + 2)
//#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...)
#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + 2)
//#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...)
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/knc/make_defs.mk 0000664 0000000 0000000 00000006101 14634250137 0022114 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration described by this file; the store-make-defs
# call at the bottom saves the variables below under this name.
# (Registration in CONFIGS_INCL is currently commented out.)
THIS_CONFIG := knc
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler flags for this configuration ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags
# to these variables; add only configuration-specific flags here.

CPPROCFLAGS :=
CMISCFLAGS := -mmic -fasm-blocks
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Base optimization level; "noopt" debug builds disable optimization.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized-kernel flags. This configuration only supports icc: any other
# compiler vendor aborts the build here, and icc gets no extra vector flags.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifneq ($(CC_VENDOR),icc)
$(error icc is required for this configuration.)
endif
CKVECFLAGS :=

# Reference-kernel flags. The gcc and clang branches mirror the other
# configurations; since non-icc vendors are rejected above, the final
# branch is the one taken.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Override the default value for LDFLAGS.
LDFLAGS := -mmic

# libm is never linked when the compiler is icc.
ifneq ($(CC_VENDOR),icc)
LDFLAGS += $(LIBM)
endif

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/knl/ 0000775 0000000 0000000 00000000000 14634250137 0017660 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/knl/bli_cntx_init_knl.c 0000664 0000000 0000000 00000011670 14634250137 0023522 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   bli_cntx_init_knl

   Initialize a context for the knl configuration.

   The reference kernels and blocksizes are installed first and then
   overridden with the optimized gemm micro-kernels, packm kernels,
   level-1f and level-1v kernels, and the blocksizes listed below. Only
   the s and d columns receive blocksize values; c and z are passed as -1.

   cntx: the context to initialize (modified in place).
*/
void bli_cntx_init_knl( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Install the reference kernels and blocksizes as a baseline.
    bli_cntx_init_knl_ref( cntx );

    // -------------------------------------------------------------------------

    // Native gemm micro-kernels (s and d) and their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_knl_asm_24x16, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,  FALSE,
      cntx
    );

    // Optimized packm kernels (double precision only).
    bli_cntx_set_packm_kers
    (
      2,
      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
      BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
      cntx
    );

    // Optimized level-1f kernels: axpyf and dotxf for s and d.
    bli_cntx_set_l1f_kers
    (
      4,
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
      cntx
    );

    // Optimized level-1v kernels: amaxv, axpyv, dotv, dotxv and scalv for
    // s and d (ten entries). axpyv and scalv use the *_zen_int10 variants.
    bli_cntx_set_l1v_kers
    (
      10,
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int,
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
      cntx
    );

    // Architecture-specific blocksizes. MC and KC pass two rows of values
    // through bli_blksz_init(); the rest use the single-row
    // bli_blksz_init_easy().
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    24,    24,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
    bli_blksz_init     ( &blkszs[ BLIS_MC ],   240,   120,    -1,    -1,
                                               288,   144,    -1,    -1 );
    bli_blksz_init     ( &blkszs[ BLIS_KC ],   336,   336,    -1,    -1,
                                               408,   408,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 14400, 14400,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

    // Register the blocksizes (and their multiples) for native execution:
    // five level-3 entries followed by two level-1f entries.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/knl/bli_family_knl.h 0000664 0000000 0000000 00000011617 14634250137 0023012 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- THREADING PARAMETERS -----------------------------------------------------
// Thread ratio of 4 (M) to 1 (N), and a maximum of 1 for both IR and JR.
// NOTE(review): presumably these steer how threads are factored across the
// loops -- confirm in the threading code that consumes these macros.
#define BLIS_THREAD_RATIO_M 4
#define BLIS_THREAD_RATIO_N 1
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
// -- MEMORY ALLOCATION --------------------------------------------------------
// The tree-barrier macros are left commented out for this configuration.
//#define BLIS_TREE_BARRIER
//#define BLIS_TREE_BARRIER_ARITY 4
// SIMD alignment, maximum SIMD size and SIMD register count.
// NOTE(review): the two sizes are presumably expressed in bytes -- confirm.
#define BLIS_SIMD_ALIGN_SIZE 64
#define BLIS_SIMD_MAX_SIZE 64
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// Disabled (commented-out) pool-allocator selection: malloc/free when
// BLIS_NO_HBWMALLOC is defined, hbw_malloc/hbw_free otherwise.
/*
#ifdef BLIS_NO_HBWMALLOC
#include <stdlib.h>
#define BLIS_MALLOC_POOL malloc
#define BLIS_FREE_POOL free
#else
#include <hbwmalloc.h>
#define BLIS_MALLOC_POOL hbw_malloc
#define BLIS_FREE_POOL hbw_free
#endif
*/
//#define BLIS_MALLOC_INTL hbw_malloc
//#define BLIS_FREE_INTL hbw_free
// Everything from this #if 0 to the final #endif is compiled out; the
// macros below therefore have no effect on the build.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single-precision (S) micro-kernel name, blocksizes and packing dimensions.
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc
#define BLIS_DEFAULT_MC_S 240
#define BLIS_DEFAULT_KC_S 240
#define BLIS_DEFAULT_NC_S 14400
#define BLIS_DEFAULT_MR_S 30
#define BLIS_DEFAULT_NR_S 16
#define BLIS_PACKDIM_MR_S 32
#define BLIS_PACKDIM_NR_S 16
// Three alternative double-precision (D) setups; with both conditions
// written as 0, the final #else variant (24x8) is the one that would be
// selected if the enclosing block were enabled.
#if 0
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc
#define BLIS_DEFAULT_MC_D 120
#define BLIS_DEFAULT_KC_D 240
#define BLIS_DEFAULT_NC_D 14400
#define BLIS_DEFAULT_MR_D 30
#define BLIS_DEFAULT_NR_D 8
#define BLIS_PACKDIM_MR_D 32
#define BLIS_PACKDIM_NR_D 8
#elif 0
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8
#define BLIS_DEFAULT_MC_D 120
#define BLIS_DEFAULT_KC_D 240
#define BLIS_DEFAULT_NC_D 14400
#define BLIS_DEFAULT_MR_D 30
#define BLIS_DEFAULT_NR_D 8
#define BLIS_PACKDIM_MR_D 32
#define BLIS_PACKDIM_NR_D 8
#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt
#define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt
#else
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8
#define BLIS_DEFAULT_MR_D 24
#define BLIS_DEFAULT_NR_D 8
#define BLIS_PACKDIM_MR_D 24
#define BLIS_PACKDIM_NR_D 8
#define BLIS_DEFAULT_MC_D 120
#define BLIS_DEFAULT_KC_D 336
#define BLIS_DEFAULT_NC_D 14400
#define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt
#define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt
#endif
// Maximum cache blocksizes: the default plus one quarter of the default for
// MC and KC, and the unchanged default for NC.
#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/knl/make_defs.mk 0000664 0000000 0000000 00000007776 14634250137 0022150 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. (The line that would append it to the list of
# configurations included by common.mk is left commented out.)
THIS_CONFIG := knl
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler flags ---
#
# NOTE: common.mk later appends general-purpose, configuration-agnostic
# flags to these variables; put configuration-specific extras here.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Framework optimization level: -O0 only for "noopt" builds, -O2 otherwise.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Builds intended for Intel SDE never use libmemkind.
ifeq ($(DEBUG_TYPE),sde)
# Defining BLIS_DISABLE_MEMKIND overrides (undefines) BLIS_ENABLE_MEMKIND.
CPPROCFLAGS += -DBLIS_DISABLE_MEMKIND
# configure normally hands this value to make through config.mk. This file
# is included after config.mk, so the assignment below replaces that
# earlier value.
MK_ENABLE_MEMKIND := no
endif

# --- Flags for optimized kernels ---
CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx512f -mavx512pf -mfpmath=sse -march=knl
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xMIC-AVX512
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx512f -mavx512pf -mfpmath=sse -march=knl
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# The OS X assembler will not accept AVX-512 unless told the target
# architecture explicitly; icc does not need the extra option.
ifeq ($(OS_NAME),Darwin)
ifneq ($(CC_VENDOR),icc)
CKVECFLAGS += -Wa,-march=knl
endif
endif

# --- Flags for reference kernels ---
# Note: for gcc and clang the reference kernels are compiled with the
# AVX-512 feature sets switched off (-mno-avx512*); the icc branch still
# passes -xMIC-AVX512.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xMIC-AVX512
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Snapshot every variable set above into a copy whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/old/ 0000775 0000000 0000000 00000000000 14634250137 0017652 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/armv7a/ 0000775 0000000 0000000 00000000000 14634250137 0021047 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/armv7a/bli_cntx_init_armv7a.c 0000664 0000000 0000000 00000006243 14634250137 0025322 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Initialize the context for the armv7a configuration.
 *
 * The context is first filled with the reference defaults by
 * bli_cntx_init_armv7a_ref(); the four assembly gemm micro-kernels and the
 * level-3 register/cache blocksizes listed below are then installed on top.
 *
 * cntx: the context to populate; it is updated in place.
 */
void bli_cntx_init_armv7a( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Set default kernel blocksizes and functions.
    bli_cntx_init_armv7a_ref( cntx );

    // -------------------------------------------------------------------------

    // Update the context with optimized native gemm micro-kernels and
    // their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      // Number of (kernel id, datatype, function, preference) tuples that
      // follow. The variadic setters take this count before the tuples (see
      // the "BLIS_NAT, 5," convention in bli_cntx_set_blkszs() below);
      // omitting it makes the first kernel id be read as the count.
      4,
      BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_armv7a_asm_4x4, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_armv7a_asm_4x4, FALSE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armv7a_asm_2x2, FALSE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armv7a_asm_2x2, FALSE,
      cntx
    );

    // Initialize level-3 blocksize objects with architecture-specific values.
    // MR/NR mirror the micro-kernel shapes registered above (4x4 real,
    // 2x2 complex).
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     4,     4,     2,     2 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],     4,     4,     2,     2 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   432,   192,    64,    64 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   352,   256,   128,   128 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4096,  4096,  4096,  4096 );

    // Update the context with the current architecture's register and cache
    // blocksizes (and multiples) for native execution. Each row is
    // (blocksize id, blocksize object, id of the blocksize it must be a
    // multiple of); 5 is the number of rows.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/old/armv7a/bli_family_armv7a.h 0000664 0000000 0000000 00000005770 14634250137 0024615 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the armv7a configuration. The only active content is the
// include guard, the `archname` definition, and the inclusion of the context
// initialization prototype template.
#ifndef BLIS_FAMILY_H
#define BLIS_FAMILY_H
// -- ARCHITECTURE-SPECIFIC PROTOTYPES -----------------------------------------
// Define the current architecture's name.
// NOTE: `archname` must be defined before the template below is included.
#define archname armv7a
// Include the context initialization function API template.
#include "bli_cntx_init_arch.h"
// Disabled (`#if 0`): macro-style kernel and blocksize settings. None of the
// macros below are defined. The numeric values are the same ones that
// bli_cntx_init_armv7a() passes to bli_blksz_init_easy().
// NOTE(review): the C and Z entries name a `_4x4` kernel while their MR/NR
// values are 2 -- looks stale; confirm before ever re-enabling this block.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 432
#define BLIS_DEFAULT_KC_S 352
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 192
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 2
#define BLIS_DEFAULT_MC_C 64
#define BLIS_DEFAULT_KC_C 128
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 128
#define BLIS_DEFAULT_NC_Z 4096
#endif
#endif
cython-blis-1.0.0/blis/_src/config/old/armv7a/make_defs.mk 0000664 0000000 0000000 00000005213 14634250137 0023317 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. (The line that would append it to the list of
# configurations included by common.mk is left commented out.)
THIS_CONFIG := armv7a
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# Fall back to gcc when no compiler has been selected.
ifeq ($(CC),)
CC := gcc
CC_VENDOR := gcc
endif

# _POSIX_C_SOURCE=200112L is required so that posix_memalign() is declared.
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 -mfloat-abi=hard
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Optimization level: -O0 only for "noopt" builds, -O3 otherwise.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3
else
COPTFLAGS := -O0
endif

# Kernels use the same optimization level as the framework; the vector
# flags are defined for gcc only, and any other vendor aborts the build.
CKOPTFLAGS := $(COPTFLAGS)
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
else
CKVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a
endif

# Snapshot every variable set above into a copy whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/old/emscripten/ 0000775 0000000 0000000 00000000000 14634250137 0022023 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/emscripten/bli_kernel.h 0000664 0000000 0000000 00000014616 14634250137 0024312 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
/* Use the same parameters as non-SIMD PNaCl */
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// NOTE(review): presumably an alignment in bytes -- confirm against the users
// of this macro.
#define BLIS_SIMD_ALIGN_SIZE 16
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
// With the register blocksizes defined further down (MR x NR = 4x3 for s and
// d, 2x3 for c and z), every MC and NC value below is a multiple of both MR
// and NR, i.e. of 12 for the real types and of 6 for the complex types.
// Keep that property when changing any of these numbers.
#define BLIS_DEFAULT_MC_S 252
#define BLIS_DEFAULT_KC_S 264
#define BLIS_DEFAULT_NC_S 8196
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 264
#define BLIS_DEFAULT_NC_C 4092
#define BLIS_DEFAULT_MC_Z 60
#define BLIS_DEFAULT_KC_Z 264
#define BLIS_DEFAULT_NC_Z 2040
// -- Register blocksizes --
// One MR/NR pair per datatype suffix: S (float), D (double), C (scomplex),
// Z (dcomplex).
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 3
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 3
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 3
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 3
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Maximum cache blocksizes (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.
//#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
//#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
//#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4)
//#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
//#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
//#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4)
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
// -- Packing register blocksize (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
//#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...)
//#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...)
//#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...)
//#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...)
//#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...)
//#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...)
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
// -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
// -- dotxf --
// -- dotxaxpyf --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
// -- axpyv --
// -- copyv --
// -- dotv --
// -- dotxv --
// -- invertv --
// -- scal2v --
// -- scalv --
// -- setv --
// -- subv --
// -- swapv --
#endif
cython-blis-1.0.0/blis/_src/config/old/emscripten/make_defs.mk 0000664 0000000 0000000 00000005305 14634250137 0024275 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Only include this block of code once: everything below is skipped when
# MAKE_DEFS_MK_INCLUDED has already been defined by an earlier inclusion.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes
#
# --- Development tools definitions --------------------------------------------
#
# --- Determine the C compiler and related flags ---
# The Emscripten compiler driver is hard-coded here (CC is assigned
# unconditionally with :=).
CC := emcc
CC_VENDOR := emcc
# Request the POSIX.1-2001 feature set (_POSIX_C_SOURCE=200112L).
# NOTE: This is needed to enable posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
# Debug flags are empty: the -g4 option is commented out on this line.
CDBGFLAGS := #-g4
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors
# Framework code is built with -O2, kernels with -O3; no vector flags are set.
COPTFLAGS := -O2
CKOPTFLAGS := -O3
CKVECFLAGS :=
# --- Determine the archiver and related flags ---
AR := emar
RANLIB := emranlib
ARFLAGS := cr
# --- Determine the linker and related flags ---
# Linking goes through the compiler driver.
LINKER := $(CC)
SOFLAGS := -shared
# TOTAL_MEMORY=67108864 is 64 MiB (64 * 1024 * 1024).
LDFLAGS := -O3 -s TOTAL_MEMORY=67108864 -s FORCE_ALIGNED_MEMORY=1 -s PRECISE_F32=2 -s GC_SUPPORT=0
# --- Determine JS interpreter ---
JSINT := node
# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif
cython-blis-1.0.0/blis/_src/config/old/haswellbb/ 0000775 0000000 0000000 00000000000 14634250137 0021615 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/haswellbb/bli_cntx_init_haswell.c 0000664 0000000 0000000 00000025701 14634250137 0026332 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Prototype instantiations for the "bb" reference kernels that
// bli_cntx_init_haswell() registers below. Each macro invocation takes
// (C type, datatype character, kernel base name).
// NOTE(review): the resulting function names are presumably
// bli_<ch><base name>, matching the identifiers used in the registration
// calls (e.g. bli_sgemmbb_haswell_ref) -- confirm against the *_PROT macros.
// Instantiate prototypes for packm kernels.
// Only float (bb4) and double (bb2) packm kernels are declared.
PACKM_KER_PROT( float, s, packm_6xk_bb4_haswell_ref )
PACKM_KER_PROT( double, d, packm_6xk_bb2_haswell_ref )
// Instantiate prototypes for level-3 kernels.
// For each of the four datatypes: gemm, gemmtrsm (lower/upper) and trsm
// (lower/upper).
GEMM_UKR_PROT( float, s, gemmbb_haswell_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_haswell_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_haswell_ref )
TRSM_UKR_PROT( float, s, trsmbb_l_haswell_ref )
TRSM_UKR_PROT( float, s, trsmbb_u_haswell_ref )
GEMM_UKR_PROT( double, d, gemmbb_haswell_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_haswell_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_haswell_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_haswell_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_haswell_ref )
GEMM_UKR_PROT( scomplex, c, gemmbb_haswell_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_haswell_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_haswell_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_l_haswell_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_u_haswell_ref )
GEMM_UKR_PROT( dcomplex, z, gemmbb_haswell_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_haswell_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_haswell_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_haswell_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_haswell_ref )
/*
 * Initialize the context for this ("haswellbb") variant of the haswell
 * configuration.
 *
 * Steps, in order:
 *   1. install the reference defaults via bli_cntx_init_haswell_ref();
 *   2. register the "bb" reference gemm/trsm micro-kernels as native kernels
 *      (the optimized asm kernels are compiled out with `#if 0`);
 *   3. register the "bb" gemmtrsm kernels as virtual micro-kernels;
 *   4. register packm, level-1f and level-1v kernels;
 *   5. set the native level-3 and level-1f blocksizes;
 *   6. set the sup (small/unpacked) thresholds, kernels and blocksizes.
 *
 * Every variadic bli_cntx_set_*() call below starts with the number of
 * tuples that follow; keep that count in sync when adding or removing rows.
 *
 * cntx: the context to populate; it is updated in place.
 */
void bli_cntx_init_haswell( cntx_t* cntx )
{
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_haswell_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
bli_cntx_set_l3_nat_ukrs
(
// Disabled branch: the 8 optimized asm kernels (preference flag TRUE).
// The active #else branch registers 12 "bb" reference kernels instead
// (gemm, trsm_l, trsm_u for each of s/d/c/z; preference flag FALSE).
#if 0
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
#else
12,
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_haswell_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_haswell_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_haswell_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_haswell_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_haswell_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_haswell_ref, FALSE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_haswell_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_haswell_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_haswell_ref, FALSE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_haswell_ref, FALSE,
BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_haswell_ref, FALSE,
BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_haswell_ref, FALSE,
#endif
cntx
);
// Update the context with customized virtual [gemm]trsm micro-kernels.
// 8 = gemmtrsm_l and gemmtrsm_u for each of the four datatypes. Unlike the
// native setter above, these tuples carry no preference flag.
bli_cntx_set_l3_vir_ukrs
(
8,
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_haswell_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_haswell_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_haswell_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_haswell_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_haswell_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_haswell_ref,
BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_haswell_ref,
BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_haswell_ref,
cntx
);
// Update the context with optimized packm kernels.
// Only the real datatypes get a custom 6xk packm kernel (bb4 for float,
// bb2 for double).
bli_cntx_set_packm_kers
(
2,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_haswell_ref,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_haswell_ref,
cntx
);
// Update the context with optimized level-1f kernels.
// These are the zen-named kernels, registered here for float and double only.
bli_cntx_set_l1f_kers
(
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
// The count of 10 is 2 each for amaxv, axpyv, dotv, dotxv and scalv. The
// amaxv pair is compiled only while its `#if 1` guard stays enabled, so the
// count must be lowered to 8 if that guard is ever turned off. The axpyv and
// scalv `#if 0 / #else` pairs select the "int10" variants and do not change
// the count.
bli_cntx_set_l1v_kers
(
10,
#if 1
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// s d c z
// Disabled branch: blocksizes matching the optimized asm kernels above.
#if 0
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#else
// Active branch: blocksizes for the "bb" reference kernels.
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 24, 12, 12, 6 );
// NOTE(review): bli_blksz_init() takes two rows of four values; the second
// row (24, 12, 6, 6) presumably is the maximum/packing dimension per
// datatype, 4x and 2x the NR of 6 for s and d in line with the bb4/bb2 packm
// kernels -- confirm against bli_blksz_init().
bli_blksz_init ( &blkszs[ BLIS_NR ], 6, 6, 6, 6,
24, 12, 6, 6 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 2076 );
#endif
// Level-1f fusing factors (AF, DF), the same for every datatype.
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
// Each row is (blocksize id, blocksize object, id of the blocksize it must
// be a multiple of); 7 is the number of rows.
bli_cntx_set_blkszs
(
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// s d c z
// Only the double-precision column carries a real value (1); s, c and z are
// set to -1.
// NOTE(review): -1 presumably marks the datatype as having no sup support --
// confirm against the sup threshold handling.
bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 1, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 1, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 1, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
// One double-precision kernel per storage combination (RRR ... CCC), 8 in
// total; the commented-out first row is not counted.
bli_cntx_set_l3_sup_kers
(
8,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// s d c z
// The blkszs[] entries for MR/NR/MC/KC/NC are overwritten here, after the
// native values were handed to bli_cntx_set_blkszs() above; do not move this
// section before that call.
// NOTE(review): this relies on bli_cntx_set_blkszs() having copied the
// values rather than kept the pointers -- confirm.
bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1,
-1, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
// Unlike bli_cntx_set_blkszs(), these rows carry no "multiple of" id.
bli_cntx_set_l3_sup_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}
cython-blis-1.0.0/blis/_src/config/old/haswellbb/bli_family_haswell.h 0000664 0000000 0000000 00000012260 14634250137 0025615 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: The include guard for this header is commented out, both here and at
// the matching "//#endif" on the last line of the file.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// Memory-pool address constants for this configuration. The _A/_B suffixes
// presumably refer to the pools holding packed A and packed B, with ALIGN
// being the address alignment and OFFSET an extra offset applied to it
// (both presumably in bytes) -- TODO confirm against the pool allocator.
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 32
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 64
// Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of
// elements within the packed matrix B.
// Each of the four macros below is defined without a value.
#define BLIS_DISABLE_HEMM_RIGHT
#define BLIS_DISABLE_SYMM_RIGHT
#define BLIS_DISABLE_TRMM_RIGHT
#define BLIS_DISABLE_TRMM3_RIGHT
// NOTE: Everything between this "#if 0" and its matching "#endif" is excluded
// from compilation, so none of the macros below are defined by this header.
// Inside, each datatype (s, d, c, z) lists alternative gemm micro-kernel
// setups; the inner "#if 1" marks the alternative that would be selected if
// the outer block were re-enabled, and the inner "#if 0" ones are rejected.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
// -- sgemm micro-kernel --
// Alternative: MR x NR = 4 x 24 (not selected).
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 24
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 6 x 16 (selected).
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 16 x 6 (not selected); no row-preference macro.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 6
#endif
// -- dgemm micro-kernel --
// Alternative: MR x NR = 4 x 12 (not selected).
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D 152
#define BLIS_DEFAULT_KC_D 160
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 12
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 6 x 8 (selected).
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 8 x 6 (not selected); no row-preference macro.
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
#endif
// -- cgemm micro-kernel --
// Alternative: MR x NR = 3 x 8 (selected).
#if 1
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 3
#define BLIS_DEFAULT_NR_C 8
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 8 x 3 (not selected); no row-preference macro.
#if 0
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 3
#endif
// -- zgemm micro-kernel --
// Alternative: MR x NR = 3 x 4 (selected).
#if 1
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 3
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// Alternative: MR x NR = 4 x 3 (not selected); no row-preference macro.
#if 0
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 3
#endif
// End of the compiled-out micro-kernel block opened by the outer "#if 0".
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/old/haswellbb/make_defs.mk 0000664 0000000 0000000 00000006410 14634250137 0024065 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The line that would append it to CONFIGS_INCL
# (the running list of configurations used by common.mk) is left disabled.
THIS_CONFIG := haswell
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler flags ---
#
# The general-purpose flag variables start out empty here; common.mk appends
# its own configuration-agnostic flags to them later. Add extras as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Emit debug symbols unless debugging is switched off.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Optimize fully except for "noopt" debug builds.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3
else
COPTFLAGS := -O0
endif

# --- Optimized kernels ---
# Same optimization level as the framework, plus vendor-specific AVX2/FMA
# vectorization flags.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_4_9_0),yes)
# gcc older than 4.9.0 only knows this architecture as "core-avx2".
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2
else
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
endif
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xCORE-AVX2
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# --- Reference kernels ---
# Reuse the optimized-kernel flags; gcc and clang additionally get relaxed
# floating-point flags, any other vendor gets the vector flags unchanged.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into a per-configuration variable whose name
# contains $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/old/loongson3a/ 0000775 0000000 0000000 00000000000 14634250137 0021734 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/loongson3a/bli_kernel.h 0000664 0000000 0000000 00000014623 14634250137 0024221 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// The _S/_D/_C/_Z suffixes below select the datatype: single real, double
// real, single complex, and double complex respectively.
#define BLIS_SIMD_ALIGN_SIZE 16
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
// All four datatypes below satisfy both constraints with the register
// blocksizes defined further down (e.g. double: MC=32, NC=1024, MR=NR=4).
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#define BLIS_DEFAULT_MC_D 32
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 1024
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 2048
// -- Register blocksizes --
// MR x NR per datatype: 8x4 for s, c, and z; 4x4 for d.
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
#define BLIS_DEFAULT_MR_Z 8
#define BLIS_DEFAULT_NR_Z 4
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Maximum cache blocksizes (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.
//#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
//#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
//#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4)
//#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
//#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
//#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4)
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
// -- Packing register blocksize (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
//#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...)
//#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...)
//#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...)
//#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...)
//#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...)
//#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...)
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
// Only the double-precision real gemm micro-kernel is overridden in this
// header; its 4x4 shape matches BLIS_DEFAULT_MR_D / BLIS_DEFAULT_NR_D above.
// No s/c/z gemm kernels are named here -- presumably those datatypes use the
// framework's defaults; verify against the header that consumes these macros.
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4
// -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
// -- dotxf --
// -- dotxaxpyf --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
// -- axpyv --
// -- copyv --
// -- dotv --
// -- dotxv --
// -- invertv --
// -- scal2v --
// -- scalv --
// -- setv --
// -- subv --
// -- swapv --
#endif
cython-blis-1.0.0/blis/_src/config/old/loongson3a/make_defs.mk 0000664 0000000 0000000 00000005212 14634250137 0024203 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The line that would append it to CONFIGS_INCL
# (the running list of configurations used by common.mk) is left disabled.
THIS_CONFIG := loongson3a
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler selection and flags ---
#
# Fall back to gcc when CC expands to the empty string.
# NOTE(review): GNU make predefines CC as "cc", so this branch is only taken
# when CC has been explicitly set to an empty value -- confirm that is intended.
ifeq "$(CC)" ""
CC := gcc
CC_VENDOR := gcc
endif

# _POSIX_C_SOURCE=200112L requests the POSIX.1-2001 interfaces, which is what
# exposes posix_memalign(); -mabi=64 is passed alongside it.
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors

# Emit debug symbols unless debugging is switched off.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Optimize (and tune for loongson3a) except for "noopt" debug builds.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3 -mtune=loongson3a
else
COPTFLAGS := -O0
endif

# Optimized kernels use the same optimization flags as the framework.
CKOPTFLAGS := $(COPTFLAGS)

# Only gcc is supported; any other vendor aborts the build.
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
else
CKVECFLAGS := -march=loongson3a
endif

# Copy every variable set above into a per-configuration variable whose name
# contains $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/old/newarch/ 0000775 0000000 0000000 00000000000 14634250137 0021301 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/newarch/bli_kernel.h 0000664 0000000 0000000 00000003300 14634250137 0023554 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Kernel header for the "newarch" configuration. It contains only the include
// guard: no blocksizes and no kernel macros are defined here.
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
#endif // BLIS_KERNEL_H
cython-blis-1.0.0/blis/_src/config/old/newarch/make_defs.mk 0000664 0000000 0000000 00000005342 14634250137 0023554 0 ustar 00root root 0000000 0000000 #!/bin/bash
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The line that would append it to CONFIGS_INCL
# (the running list of configurations used by common.mk) is left disabled.
THIS_CONFIG := newarch
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler selection and flags ---
#
# Fall back to gcc when CC expands to the empty string.
# NOTE(review): GNU make predefines CC as "cc", so this branch is only taken
# when CC has been explicitly set to an empty value -- confirm that is intended.
ifeq "$(CC)" ""
CC := gcc
CC_VENDOR := gcc
endif

# _POSIX_C_SOURCE=200112L requests the POSIX.1-2001 interfaces, which is what
# exposes posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99
CPICFLAGS := -fPIC
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors

# Emit debug symbols unless debugging is switched off.
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Use -O2 except for "noopt" debug builds.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized kernels use the same optimization flags as the framework.
CKOPTFLAGS := $(COPTFLAGS)

# No vectorization flags are set for any supported vendor; an unsupported
# vendor aborts the build.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS :=
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS :=
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS :=
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Copy every variable set above into a per-configuration variable whose name
# contains $(THIS_CONFIG).
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/old/pnacl/ 0000775 0000000 0000000 00000000000 14634250137 0020747 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/old/pnacl/bli_kernel.h 0000664 0000000 0000000 00000017215 14634250137 0023234 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
/*
* SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible.
* Therefore, if compilation targets an older Chrome release, we use scalar kernels.
* The target Chrome version is indicated by PPAPI_MACRO defined in the header below.
*/
#include
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// The _S/_D/_C/_Z suffixes below select the datatype: single real, double
// real, single complex, and double complex respectively. Only the
// single-precision (s, c) values depend on PPAPI_RELEASE; the d and z values
// are unconditional. If PPAPI_RELEASE is not defined, the preprocessor
// evaluates it as 0 and the "#else" values are used.
#define BLIS_SIMD_ALIGN_SIZE 16
// -- Cache blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
// Every MC/NC value below is a multiple of both the MR and NR chosen for the
// same datatype and PPAPI_RELEASE branch further down.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_S 256
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8192
#else
#define BLIS_DEFAULT_MC_S 252
#define BLIS_DEFAULT_KC_S 264
#define BLIS_DEFAULT_NC_S 8196
#endif
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MC_C 128
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#else
#define BLIS_DEFAULT_MC_C 120
#define BLIS_DEFAULT_KC_C 264
#define BLIS_DEFAULT_NC_C 4092
#endif
#define BLIS_DEFAULT_MC_Z 60
#define BLIS_DEFAULT_KC_Z 264
#define BLIS_DEFAULT_NC_Z 2040
// -- Register blocksizes --
// s: 8x4 for PPAPI_RELEASE >= 36, otherwise 4x3.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#else
#define BLIS_DEFAULT_MR_S 4
#define BLIS_DEFAULT_NR_S 3
#endif
// d: always 4x3.
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 3
// c: 4x4 for PPAPI_RELEASE >= 36, otherwise 2x3.
#if PPAPI_RELEASE >= 36
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 4
#else
#define BLIS_DEFAULT_MR_C 2
#define BLIS_DEFAULT_NR_C 3
#endif
// z: always 2x3.
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 3
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Maximum cache blocksizes (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.
//#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
//#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
//#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4)
//#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
//#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
//#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4)
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
// -- Packing register blocksize (for packed micro-panels) --
// NOTE: These register blocksize "extensions" determine whether the
// leading dimensions used within the packed micro-panels are equal to
// or greater than their corresponding register blocksizes above.
//#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...)
//#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...)
//#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...)
//#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...)
//#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...)
//#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...)
//#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...)
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...)
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-3 KERNEL DEFINITIONS -----------------------------------------------
// -- gemm --
// Optimized gemm micro-kernels are named only for the single-precision
// datatypes (s, c), and only when PPAPI_RELEASE is at least 36.
#if PPAPI_RELEASE >= 36
#define BLIS_SGEMM_UKERNEL bli_sgemm_opt
#define BLIS_CGEMM_UKERNEL bli_cgemm_opt
#endif
// -- trsm-related --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
// -- dotxf --
// -- dotxaxpyf --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- addv --
// -- axpyv --
// As with gemm, the optimized axpyv kernels cover only s and c and are gated
// on PPAPI_RELEASE >= 36.
#if PPAPI_RELEASE >= 36
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt
#define BLIS_CAXPYV_KERNEL bli_caxpyv_opt
#endif
// -- copyv --
// -- dotv --
// The dotv kernels are named for all four datatypes, with no PPAPI_RELEASE
// condition.
#define BLIS_SDOTV_KERNEL bli_sdotv_opt
#define BLIS_DDOTV_KERNEL bli_ddotv_opt
#define BLIS_CDOTV_KERNEL bli_cdotv_opt
#define BLIS_ZDOTV_KERNEL bli_zdotv_opt
// -- dotxv --
// -- invertv --
// -- scal2v --
// -- scalv --
// -- setv --
// -- subv --
// -- swapv --
#endif
cython-blis-1.0.0/blis/_src/config/old/pnacl/make_defs.mk 0000664 0000000 0000000 00000005641 14634250137 0023224 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Guard: the definitions below are processed only the first time this file is
# included.
ifndef MAKE_DEFS_MK_INCLUDED
MAKE_DEFS_MK_INCLUDED := yes

#
# --- Development tools definitions --------------------------------------------
#

# --- C compiler and its flags ---
CC := pnacl-clang
CC_VENDOR := pnacl-clang

# _POSIX_C_SOURCE=200112L requests the POSIX.1-2001 interfaces, which is what
# exposes posix_memalign().
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
# Headers are additionally searched under $(NACL_SDK_ROOT)/include.
CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include
CPICFLAGS :=
CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors
CDBGFLAGS := -g
COPTFLAGS := -O3
# Kernels get the framework optimization flags plus -ffast-math, and no
# extra vectorization flags.
CKOPTFLAGS := $(COPTFLAGS) -ffast-math
CKVECFLAGS :=

# --- Archiver and its flags ---
AR := pnacl-ar
ARFLAGS := rcs

# --- Linker and its flags ---
# Linking is done through the compiler driver; -lm is added for every vendor
# except icc.
LINKER := $(CC)
SOFLAGS :=
ifneq "$(CC_VENDOR)" "icc"
LDFLAGS := -lm
endif

# --- Finalizer and its flags ---
FINALIZER := pnacl-finalize
FINFLAGS :=

# --- Translator and its flags ---
# One architecture flag set per translation target (x86-64, i686, armv7).
TRANSLATOR := pnacl-translate
TRNSFLAGS := -O3
TRNSAMD64FLAGS := -arch x86-64
TRNSX86FLAGS := -arch i686
TRNSARMFLAGS := -arch armv7

# end of ifndef MAKE_DEFS_MK_INCLUDED conditional block
endif
cython-blis-1.0.0/blis/_src/config/penryn/ 0000775 0000000 0000000 00000000000 14634250137 0020407 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/penryn/bli_cntx_init_penryn.c 0000664 0000000 0000000 00000006511 14634250137 0024776 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_penryn( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bsz_tab[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions for this
    // configuration; the calls below then replace selected entries.
    bli_cntx_init_penryn_ref( cntx );

    // -------------------------------------------------------------------------

    // Register the penryn assembly micro-kernels as native level-3 kernels,
    // together with their storage preferences. The leading count (4) is the
    // number of active entries that follow; the two complex gemm entries are
    // kept only as commented-out placeholders.
    bli_cntx_set_l3_nat_ukrs(
        4,
        BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_penryn_asm_8x4,       FALSE,
        BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_penryn_asm_4x4,       FALSE,
        //BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4,       FALSE,
        //BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4,       FALSE,
        BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_penryn_asm_4x4, FALSE,
        BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_penryn_asm_4x4, FALSE,
        cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // Only the s and d columns carry values here; c and z are left at 0.
    //                                             s     d     c     z
    bli_blksz_init_easy( &bsz_tab[ BLIS_MR ],      8,    4,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NR ],      4,    4,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_MC ],    768,  384,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_KC ],    384,  384,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NC ],   4096, 4096,    0,    0 );

    // Hand the register and cache blocksizes to the context for native
    // execution. Each of the 5 entries is (blocksize id, blocksize object,
    // id of the blocksize that serves as its multiple).
    bli_cntx_set_blkszs(
        BLIS_NAT, 5,
        BLIS_NC, &bsz_tab[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bsz_tab[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bsz_tab[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bsz_tab[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bsz_tab[ BLIS_MR ], BLIS_MR,
        cntx
    );
}
cython-blis-1.0.0/blis/_src/config/penryn/bli_family_penryn.h 0000664 0000000 0000000 00000006132 14634250137 0024264 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the penryn configuration.
// The include guard below is commented out, so this header has no guard of
// its own.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// SIMD alignment used by this configuration (presumably in bytes -- confirm
// against the code that consumes BLIS_SIMD_ALIGN_SIZE).
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything inside this #if 0 region is excluded from compilation. The
// blocksize values listed here are the same numbers that
// bli_cntx_init_penryn() passes to bli_blksz_init_easy(); the kernel names
// here lack the "_penryn" infix used by that function.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single precision real.
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 768
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
// Double precision real.
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 384
#define BLIS_DEFAULT_KC_D 384
#define BLIS_DEFAULT_NC_D 4096
// Double precision fused gemm+trsm kernels (lower and upper).
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1
#define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/penryn/make_defs.mk 0000664 0000000 0000000 00000006123 14634250137 0022660 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. It is handed to store-make-defs at the bottom
# of this file so that the flag variables set here are saved under names
# that contain it.
THIS_CONFIG := penryn
# Adding the name to the running CONFIGS_INCL list is left disabled:
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# these variables, so they start out empty. Put configuration-specific
# additions here if any are needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization level: -O0 only when DEBUG_TYPE is "noopt".
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized kernels: the general level followed by -O3. With gcc and clang
# the last -O option on the command line is the effective one.
CKOPTFLAGS := $(COPTFLAGS) -O3

# Vector/ISA flags for optimized kernels, selected by compiler vendor.
# Any vendor other than gcc, icc or clang aborts the build.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xSSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference kernels reuse the optimized-kernel settings; gcc and clang
# additionally get relaxed floating-point flags.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Save all of the variables above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/piledriver/ 0000775 0000000 0000000 00000000000 14634250137 0021241 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/piledriver/bli_cntx_init_piledriver.c 0000664 0000000 0000000 00000006305 14634250137 0026463 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_piledriver( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bsz_tab[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions for this
    // configuration; the calls below then replace selected entries.
    bli_cntx_init_piledriver_ref( cntx );

    // -------------------------------------------------------------------------

    // Register the piledriver assembly gemm micro-kernels as native level-3
    // kernels, together with their storage preferences. One kernel is given
    // for each of the four datatypes; the leading count (4) is the number of
    // entries that follow.
    bli_cntx_set_l3_nat_ukrs(
        4,
        BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
        BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
        BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
        BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
        cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    //                                             s     d     c     z
    bli_blksz_init_easy( &bsz_tab[ BLIS_MR ],     16,    8,    4,    2 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NR ],      3,    3,    2,    2 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_MC ],   2016, 1008,  512,  400 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_KC ],    128,  128,  256,  160 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NC ],   8400, 8400, 8400, 8400 );

    // Hand the register and cache blocksizes to the context for native
    // execution. Each of the 5 entries is (blocksize id, blocksize object,
    // id of the blocksize that serves as its multiple).
    bli_cntx_set_blkszs(
        BLIS_NAT, 5,
        BLIS_NC, &bsz_tab[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bsz_tab[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bsz_tab[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bsz_tab[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bsz_tab[ BLIS_MR ], BLIS_MR,
        cntx
    );
}
cython-blis-1.0.0/blis/_src/config/piledriver/bli_family_piledriver.h 0000664 0000000 0000000 00000005717 14634250137 0025760 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the piledriver configuration.
// The include guard below is commented out, so this header has no guard of
// its own.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// SIMD alignment used by this configuration (presumably in bytes -- confirm
// against the code that consumes BLIS_SIMD_ALIGN_SIZE).
#define BLIS_SIMD_ALIGN_SIZE 16
// Everything inside this #if 0 region is excluded from compilation. The
// blocksize values listed here are the same numbers that
// bli_cntx_init_piledriver() passes to bli_blksz_init_easy(); the kernel
// names here lack the "_piledriver" infix used by that function.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// Single precision real.
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3
#define BLIS_DEFAULT_MC_S 2016
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 8400
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 3
// Double precision real.
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3
#define BLIS_DEFAULT_MC_D 1008
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 8400
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 3
// Single precision complex.
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2
#define BLIS_DEFAULT_MC_C 512
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 8400
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 2
// Double precision complex.
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2
#define BLIS_DEFAULT_MC_Z 400
#define BLIS_DEFAULT_KC_Z 160
#define BLIS_DEFAULT_NC_Z 8400
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/piledriver/make_defs.mk 0000664 0000000 0000000 00000006151 14634250137 0023513 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. It is handed to store-make-defs at the bottom
# of this file so that the flag variables set here are saved under names
# that contain it.
THIS_CONFIG := piledriver
# Adding the name to the running CONFIGS_INCL list is left disabled:
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# these variables, so they start out empty. Put configuration-specific
# additions here if any are needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization level: -O0 only when DEBUG_TYPE is "noopt".
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized kernels: the general level followed by -O3. With gcc and clang
# the last -O option on the command line is the effective one.
CKOPTFLAGS := $(COPTFLAGS) -O3

# Vector/ISA flags for optimized kernels. gcc and clang receive the same
# set: AVX and FMA are enabled for -march=bdver2 while FMA4, TBM, XOP and
# LWP are switched off. Any other vendor aborts the build.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif

# Reference kernels reuse the optimized-kernel settings; gcc and clang
# additionally get relaxed floating-point flags.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Save all of the variables above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/power10/ 0000775 0000000 0000000 00000000000 14634250137 0020371 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/power10/bli_cntx_init_power10.c 0000664 0000000 0000000 00000013730 14634250137 0024743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Instantiate prototypes for packm kernels.
// NOTE(review): each *_PROT macro presumably expands to a prototype named
// bli_<ch><opname>; bli_cntx_init_power10() below references names of that
// form (e.g. bli_spackm_6xk_bb4_power10_ref) -- confirm against the macro
// definitions.
PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref )
PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref )
// Instantiate prototypes for level-3 kernels.
// The same five "bb" reference kernels (gemm, gemmtrsm lower/upper, trsm
// lower/upper) are declared for each of the four datatypes. Note that
// bli_cntx_init_power10() registers mma gemm kernels for float and double,
// so the s/d gemmbb prototypes are not referenced by that function.
// -- float --
GEMM_UKR_PROT( float, s, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref )
TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref )
// -- double --
GEMM_UKR_PROT( double, d, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref )
// -- scomplex --
GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref )
// -- dcomplex --
GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref )
void bli_cntx_init_power10( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bsz_tab[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions for this
    // configuration; the calls below then replace selected entries.
    bli_cntx_init_power10_ref( cntx );

    // -------------------------------------------------------------------------

    // Register native level-3 micro-kernels and their storage preferences.
    // float and double use the mma gemm kernels (preference TRUE); every
    // other entry is a "bb" reference kernel (preference FALSE). The leading
    // count (12) is the number of entries that follow.
    bli_cntx_set_l3_nat_ukrs(
        12,
        // float
        BLIS_GEMM_UKR,   BLIS_FLOAT,    bli_sgemm_power10_mma_8x16, TRUE,
        BLIS_TRSM_L_UKR, BLIS_FLOAT,    bli_strsmbb_l_power10_ref,  FALSE,
        BLIS_TRSM_U_UKR, BLIS_FLOAT,    bli_strsmbb_u_power10_ref,  FALSE,
        // double
        BLIS_GEMM_UKR,   BLIS_DOUBLE,   bli_dgemm_power10_mma_8x8,  TRUE,
        BLIS_TRSM_L_UKR, BLIS_DOUBLE,   bli_dtrsmbb_l_power10_ref,  FALSE,
        BLIS_TRSM_U_UKR, BLIS_DOUBLE,   bli_dtrsmbb_u_power10_ref,  FALSE,
        // scomplex
        BLIS_GEMM_UKR,   BLIS_SCOMPLEX, bli_cgemmbb_power10_ref,    FALSE,
        BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref,  FALSE,
        BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref,  FALSE,
        // dcomplex
        BLIS_GEMM_UKR,   BLIS_DCOMPLEX, bli_zgemmbb_power10_ref,    FALSE,
        BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref,  FALSE,
        BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref,  FALSE,
        cntx
    );

    // Register customized virtual gemmtrsm micro-kernels (lower and upper)
    // for all four datatypes; 8 entries of (kernel id, datatype, function).
    bli_cntx_set_l3_vir_ukrs(
        8,
        BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_l_power10_ref,
        BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_u_power10_ref,
        BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_l_power10_ref,
        BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_u_power10_ref,
        BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref,
        BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref,
        BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref,
        BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref,
        cntx
    );

    // Register the packm kernels for the 6xk slot, float and double only.
    bli_cntx_set_packm_kers(
        2,
        BLIS_PACKM_6XK_KER, BLIS_FLOAT,  bli_spackm_6xk_bb4_power10_ref,
        BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref,
        cntx
    );

    // Level-3 blocksizes, one column per datatype. Only the s and d columns
    // carry values here; c and z are given as -1.
    //                                             s     d     c     z
    bli_blksz_init_easy( &bsz_tab[ BLIS_MR ],      8,    8,   -1,   -1 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NR ],     16,    8,   -1,   -1 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_MC ],    832,  320,   -1,   -1 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_KC ],   1026,  960,   -1,   -1 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NC ],   4096, 4096,   -1,   -1 );

    // Hand the register and cache blocksizes to the context for native
    // execution. Each of the 5 entries is (blocksize id, blocksize object,
    // id of the blocksize that serves as its multiple).
    bli_cntx_set_blkszs(
        BLIS_NAT, 5,
        // level-3
        BLIS_NC, &bsz_tab[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bsz_tab[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bsz_tab[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bsz_tab[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bsz_tab[ BLIS_MR ], BLIS_MR,
        cntx
    );
}
cython-blis-1.0.0/blis/_src/config/power10/bli_family_power10.h 0000664 0000000 0000000 00000003456 14634250137 0024236 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the power10 configuration: memory pool address settings.
// The _A / _B suffixes select separate values for two pools.
// NOTE(review): presumably these are byte counts applied to the pools that
// hold packed blocks of matrix A and matrix B -- confirm against the code
// that consumes these macros.
// Alignment of pool block addresses.
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
// Offset applied to pool block addresses; A and B use different offsets.
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 192
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 152
cython-blis-1.0.0/blis/_src/config/power10/make_defs.mk 0000664 0000000 0000000 00000005470 14634250137 0022646 0 ustar 00root root 0000000 0000000
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. It is handed to store-make-defs at the bottom
# of this file so that the flag variables set here are saved under names
# that contain it.
THIS_CONFIG := power10
# Adding the name to the running CONFIGS_INCL list is left disabled:
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends general-purpose, configuration-agnostic flags to
# these variables, so they start out empty. Put configuration-specific
# additions here if any are needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization level: -O0 only when DEBUG_TYPE is "noopt".
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Optimized kernels: the general level followed by -O3. With gcc and clang
# the last -O option on the command line is the effective one.
CKOPTFLAGS := $(COPTFLAGS) -O3

# CPU selection flags for optimized kernels; gcc and clang receive the same
# set. For any other vendor, print the detected vendor and abort the build.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=power10 -mtune=power10
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mcpu=power10 -mtune=power10
else
$(info $(CC_VENDOR))
$(error gcc, clang is required for this configuration.)
endif

# Reference kernels are built with exactly the optimized-kernel flags.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)

# Save all of the variables above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/power7/ 0000775 0000000 0000000 00000000000 14634250137 0020317 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/power7/bli_cntx_init_power7.c 0000664 0000000 0000000 00000005744 14634250137 0024625 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_cntx_init_power7( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t bsz_tab[ BLIS_NUM_BLKSZS ];

    // Begin with the default kernel blocksizes and functions for this
    // configuration; the calls below then replace selected entries.
    bli_cntx_init_power7_ref( cntx );

    // -------------------------------------------------------------------------

    // Register the single optimized native micro-kernel of this
    // configuration (double precision gemm) and its storage preference.
    bli_cntx_set_l3_nat_ukrs(
        1,
        BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE,
        cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    // Only the d column carries values here; s, c and z are left at 0.
    //                                             s     d     c     z
    bli_blksz_init_easy( &bsz_tab[ BLIS_MR ],      0,    8,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NR ],      0,    4,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_MC ],      0,   64,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_KC ],      0,  256,    0,    0 );
    bli_blksz_init_easy( &bsz_tab[ BLIS_NC ],      0, 4096,    0,    0 );

    // Hand the register and cache blocksizes to the context for native
    // execution. Each of the 5 entries is (blocksize id, blocksize object,
    // id of the blocksize that serves as its multiple).
    bli_cntx_set_blkszs(
        BLIS_NAT, 5,
        BLIS_NC, &bsz_tab[ BLIS_NC ], BLIS_NR,
        BLIS_KC, &bsz_tab[ BLIS_KC ], BLIS_KR,
        BLIS_MC, &bsz_tab[ BLIS_MC ], BLIS_MR,
        BLIS_NR, &bsz_tab[ BLIS_NR ], BLIS_NR,
        BLIS_MR, &bsz_tab[ BLIS_MR ], BLIS_MR,
        cntx
    );
}
cython-blis-1.0.0/blis/_src/config/power7/bli_family_power7.h 0000664 0000000 0000000 00000004062 14634250137 0024104 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the power7 configuration.
// NOTE: The include guard is commented out and every definition below sits
// inside an "#if 0" region, so this header currently contributes nothing to
// the build.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// NOTE(review): the double-precision blocksizes below (MR=8, NR=4, MC=64,
// KC=256, NC=4096) equal the values passed to bli_blksz_init_easy() in the
// power7 context-initialization code, whereas the micro-kernel registered
// there is bli_dgemm_power7_int_8x4 rather than the name used here. These
// macros look like a leftover of an older, macro-based configuration
// scheme -- confirm before re-enabling.
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 64
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/power7/make_defs.mk 0000664 0000000 0000000 00000005655 14634250137 0022601 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of this configuration. (Adding it to the running list of
# configurations included by common.mk is currently commented out.)
THIS_CONFIG := power7
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends assorted general-purpose, configuration-agnostic
# flags to these variables. Configuration-specific flags may be added here.
CPPROCFLAGS :=
CMISCFLAGS := -mcpu=power7
CPICFLAGS :=
CWARNFLAGS :=

# Debugging symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization flags; "noopt" builds switch optimization off.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2 -mtune=power7
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels. Any compiler vendor other than gcc
# aborts the build at this point.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifneq ($(CC_VENDOR),gcc)
$(error gcc is required for this configuration.)
else
CKVECFLAGS := -mvsx
endif

# Flags specific to reference kernels. gcc and clang additionally receive
# the relaxed floating-point flags; other vendors reuse the kernel flags.
# NOTE: given the gcc-only check above, only the gcc branch is reachable.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into a counterpart whose name contains the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/power9/ 0000775 0000000 0000000 00000000000 14634250137 0020321 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/power9/bli_cntx_init_power9.c 0000664 0000000 0000000 00000013650 14634250137 0024624 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Instantiate prototypes for packm kernels.
// NOTE(review): the prototype macros presumably prepend "bli_" plus the type
// character to the given name; that matches the identifiers registered in
// bli_cntx_init_power9() below (e.g. bli_spackm_6xk_bb4_power9_ref).
PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref )
PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref )
// Instantiate prototypes for level-3 kernels.
// Each datatype (s, d, c, z) gets the same five prototypes: gemm, gemmtrsm
// lower/upper, and trsm lower/upper. All of them are registered in
// bli_cntx_init_power9() below except the double-precision gemm one, where
// bli_dgemm_power9_asm_12x6 is registered instead.
// -- single precision real --
GEMM_UKR_PROT( float, s, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref )
TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref )
// -- double precision real --
GEMM_UKR_PROT( double, d, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref )
TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref )
// -- single precision complex --
GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref )
TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref )
// -- double precision complex --
GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref )
GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref )
TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref )
/*
   Populate the given context for the power9 configuration: start from the
   reference defaults, then register the power9 level-3 micro-kernels, the
   virtual [gemm]trsm micro-kernels, the packm kernels, and the level-3
   blocksizes.
*/
void bli_cntx_init_power9( cntx_t* cntx )
{
    // Scratch table of blocksize objects, indexed by blocksize id.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default (reference) blocksizes and kernel functions.
    bli_cntx_init_power9_ref( cntx );

    // ---------------------------------------------------------------------

    // Native level-3 micro-kernels and their storage preferences. Double
    // precision gemm uses the 12x6 assembly kernel; every other entry uses
    // a "bb" reference kernel.
    bli_cntx_set_l3_nat_ukrs
    (
      12,
      BLIS_GEMM_UKR,   BLIS_FLOAT,    bli_sgemmbb_power9_ref,    FALSE,
      BLIS_TRSM_L_UKR, BLIS_FLOAT,    bli_strsmbb_l_power9_ref,  FALSE,
      BLIS_TRSM_U_UKR, BLIS_FLOAT,    bli_strsmbb_u_power9_ref,  FALSE,
      BLIS_GEMM_UKR,   BLIS_DOUBLE,   bli_dgemm_power9_asm_12x6, FALSE,
      BLIS_TRSM_L_UKR, BLIS_DOUBLE,   bli_dtrsmbb_l_power9_ref,  FALSE,
      BLIS_TRSM_U_UKR, BLIS_DOUBLE,   bli_dtrsmbb_u_power9_ref,  FALSE,
      BLIS_GEMM_UKR,   BLIS_SCOMPLEX, bli_cgemmbb_power9_ref,    FALSE,
      BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref,  FALSE,
      BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref,  FALSE,
      BLIS_GEMM_UKR,   BLIS_DCOMPLEX, bli_zgemmbb_power9_ref,    FALSE,
      BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref,  FALSE,
      BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref,  FALSE,
      cntx
    );

    // Customized virtual [gemm]trsm micro-kernels, lower and upper variants
    // for each of the four datatypes.
    bli_cntx_set_l3_vir_ukrs
    (
      8,
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_l_power9_ref,
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsmbb_u_power9_ref,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_l_power9_ref,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsmbb_u_power9_ref,
      BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref,
      BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref,
      BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref,
      BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref,
      cntx
    );

    // Packm kernels for the 6xk shape (single and double precision only).
    bli_cntx_set_packm_kers
    (
      2,
      BLIS_PACKM_6XK_KER, BLIS_FLOAT,  bli_spackm_6xk_bb4_power9_ref,
      BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref,
      cntx
    );

    // Level-3 blocksizes. Only the second value of each row is set; the
    // others are -1.
    // NOTE(review): the second row given for NR presumably holds the
    // maximum/extended values -- confirm against bli_blksz_init().
    bli_blksz_init_easy( &bsz[ BLIS_NC ], -1, 8190, -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ], -1, 1408, -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ], -1,  576, -1, -1 );
    bli_blksz_init     ( &bsz[ BLIS_NR ], -1,    6, -1, -1,
                                          -1,   12, -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ], -1,   12, -1, -1 );

    // Register the register and cache blocksizes, each paired with the id
    // of its blocksize multiple, for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      // level-3
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/power9/bli_family_power9.h 0000664 0000000 0000000 00000004051 14634250137 0024106 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the power9 configuration.
// Address alignment and offset overrides for the A and B memory pools.
// NOTE(review): presumably byte counts consumed by the framework's pool
// allocator -- confirm against the memory-pool code before changing them.
#define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096
#define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 192
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 152
// Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of
// elements within the packed matrix B.
#define BLIS_DISABLE_HEMM_RIGHT
#define BLIS_DISABLE_SYMM_RIGHT
#define BLIS_DISABLE_TRMM_RIGHT
#define BLIS_DISABLE_TRMM3_RIGHT
cython-blis-1.0.0/blis/_src/config/power9/make_defs.mk 0000664 0000000 0000000 00000005474 14634250137 0022602 0 ustar 00root root 0000000 0000000
#
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2019, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of this configuration. (Adding it to the running list of
# configurations included by common.mk is currently commented out.)
THIS_CONFIG := power9
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends assorted general-purpose, configuration-agnostic
# flags to these variables. Configuration-specific flags may be added here.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debugging symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization flags; "noopt" builds switch optimization off.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels. gcc and the IBM compiler are the only
# accepted vendors; each also defines XLC (0 or 1) for the sources. Any
# other vendor is printed and then aborts the build.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mcpu=power9 -mtune=power9 -DXLC=0
else ifeq ($(CC_VENDOR),IBM)
CKVECFLAGS := -qarch=pwr9 -qtune=pwr9 -DXLC=1
else
$(info $(CC_VENDOR))
$(error gcc/xlc is required for this configuration.)
endif

# Flags specific to reference kernels: identical to the optimized-kernel
# flags for this configuration.
CROPTFLAGS := $(CKOPTFLAGS)
CRVECFLAGS := $(CKVECFLAGS)

# Copy every variable set above into a counterpart whose name contains the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/sandybridge/ 0000775 0000000 0000000 00000000000 14634250137 0021367 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/sandybridge/bli_cntx_init_sandybridge.c 0000664 0000000 0000000 00000006307 14634250137 0026741 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Populate the given context for the sandybridge configuration: start from
   the reference defaults, then register the optimized gemm micro-kernels
   and the level-3 blocksizes for all four datatypes.
*/
void bli_cntx_init_sandybridge( cntx_t* cntx )
{
    // Scratch table of blocksize objects, indexed by blocksize id.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default (reference) blocksizes and kernel functions.
    bli_cntx_init_sandybridge_ref( cntx );

    // ---------------------------------------------------------------------

    // Optimized native gemm micro-kernels, one per datatype, together with
    // their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      4,
      BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_sandybridge_asm_8x8, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_sandybridge_asm_8x4, FALSE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE,
      cntx
    );

    // Architecture-specific level-3 blocksizes, one column per datatype.
    //                                       s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_NC ], 4096, 4096, 4096, 4096 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],  384,  256,  256,  192 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],  128,   96,   96,   64 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    8,    4,    4,    4 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ],    8,    8,    8,    4 );

    // Register the register and cache blocksizes, each paired with the id
    // of its blocksize multiple, for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/sandybridge/bli_family_sandybridge.h 0000664 0000000 0000000 00000005374 14634250137 0026233 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the sandybridge configuration.
// NOTE: The include guard is commented out and every definition below sits
// inside an "#if 0" region, so this header currently contributes nothing to
// the build.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
// NOTE(review): the per-datatype blocksizes below equal the values passed to
// bli_blksz_init_easy() in bli_cntx_init_sandybridge(), whereas the
// micro-kernels registered there carry "sandybridge" in their names (e.g.
// bli_sgemm_sandybridge_asm_8x8). These macros look like a leftover of an
// older, macro-based configuration scheme -- confirm before re-enabling.
// -- single precision real --
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 8
// -- double precision real --
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4
#define BLIS_DEFAULT_MC_D 96
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4
// -- single precision complex --
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4
#define BLIS_DEFAULT_MC_C 96
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4
// -- double precision complex --
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 192
#define BLIS_DEFAULT_NC_Z 4096
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/sandybridge/make_defs.mk 0000664 0000000 0000000 00000006376 14634250137 0023652 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of this configuration. (Adding it to the running list of
# configurations included by common.mk is currently commented out.)
THIS_CONFIG := sandybridge
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#
# NOTE: common.mk appends assorted general-purpose, configuration-agnostic
# flags to these variables. Configuration-specific flags may be added here.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Debugging symbols are requested unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# General optimization flags; "noopt" builds switch optimization off.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels. gcc, icc and clang are accepted; any
# other compiler vendor aborts the build.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_4_9_0),yes)
# gcc older than 4.9.0 needs a different label for -march.
CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx
else
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
endif
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xAVX
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Flags specific to reference kernels. gcc and clang additionally receive
# the relaxed floating-point flags; other vendors reuse the kernel flags.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into a counterpart whose name contains the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/skx/ 0000775 0000000 0000000 00000000000 14634250137 0017701 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/skx/bli_cntx_init_skx.c 0000664 0000000 0000000 00000011250 14634250137 0023556 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Populate the given context for the skx configuration: start from the
   reference defaults, then register the optimized gemm micro-kernels, the
   level-1f and level-1v kernels, and the level-3 / level-1f blocksizes for
   single and double precision.
*/
void bli_cntx_init_skx( cntx_t* cntx )
{
    // Scratch table of blocksize objects, indexed by blocksize id.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];

    // Begin with the default (reference) blocksizes and kernel functions.
    bli_cntx_init_skx_ref( cntx );

    // ---------------------------------------------------------------------

    // Optimized native gemm micro-kernels and their storage preferences.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      // gemm
      BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14,    FALSE,
      cntx
    );

    // Optimized level-1f kernels (the zen-named implementations are the
    // ones registered here).
    bli_cntx_set_l1f_kers
    (
      4,
      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
      cntx
    );

    // Optimized level-1v kernels. The preprocessor toggles choose between
    // alternative implementations; with the current settings ten entries
    // are active, matching the count passed first.
    bli_cntx_set_l1v_kers
    (
      10,
#if 1
      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
      // axpyv
#if 0
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
      // dotv
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int,
      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
      // scalv
#if 0
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
      cntx
    );

    // Architecture-specific blocksizes, one column per datatype; the c and
    // z columns are -1 throughout.
    // NOTE(review): the second row given for KC presumably holds the
    // maximum/extended values -- confirm against bli_blksz_init().
    //                                       s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_NC ], 3072, 3752,   -1,   -1 );
    bli_blksz_init     ( &bsz[ BLIS_KC ],  384,  256,   -1,   -1,
                                           480,  320,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],  480,  240,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],   12,   14,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MR ],   32,   16,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_AF ],    8,    8,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_DF ],    8,    8,   -1,   -1 );

    // Register the register and cache blocksizes, each paired with the id
    // of its blocksize multiple, for native execution.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,
      // level-3
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,
      // level-1f
      BLIS_AF, &bsz[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &bsz[ BLIS_DF ], BLIS_DF,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/skx/bli_family_skx.h 0000664 0000000 0000000 00000011317 14634250137 0023051 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family header for the skx configuration.
// NOTE: The include guard is commented out. Only the threading and
// memory-allocation macros below are active; the level-3 micro-kernel
// constants further down sit inside an "#if 0" region.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- THREADING PARAMETERS -----------------------------------------------------
#define BLIS_THREAD_RATIO_M 3
#define BLIS_THREAD_RATIO_N 2
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 4
// -- MEMORY ALLOCATION --------------------------------------------------------
#define BLIS_SIMD_ALIGN_SIZE 64
#define BLIS_SIMD_MAX_SIZE 64
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
// The pool allocator overrides below are commented out.
//#include
//#define BLIS_MALLOC_POOL malloc
//#define BLIS_FREE_POOL free
#if 0
// NOTE(review): everything up to the matching #endif is compiled out. The
// double-precision values here (e.g. NR=12, MC=144, KC=336, NC=5760) differ
// from the ones bli_cntx_init_skx() registers (NR=14, MC=240, KC=256,
// NC=3752), so treat this region as historical rather than authoritative.
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
// -- Cache and register blocksizes --
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
#define BLIS_DEFAULT_MC_D 144
#define BLIS_DEFAULT_KC_D 336
#define BLIS_DEFAULT_NC_D 5760
#define BLIS_DEFAULT_MR_D 16
#define BLIS_DEFAULT_NR_D 12
#define BLIS_PACKDIM_MR_D 16
#define BLIS_PACKDIM_NR_D 12
// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.
//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1
// -- Maximum cache blocksizes (for optimizing edge cases) --
// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.
// NOTE(review): the _S maxima reference BLIS_DEFAULT_*_S macros that are not
// defined anywhere in this header -- harmless while the region is disabled.
#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)
#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)
//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)
//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)
#endif
//#endif
cython-blis-1.0.0/blis/_src/config/skx/make_defs.mk 0000664 0000000 0000000 00000011276 14634250137 0022157 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
# Name of this configuration. Appending it to the running list of
# configurations included by common.mk is currently commented out.
THIS_CONFIG := skx
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends its own general-purpose, configuration-agnostic
# flags to these variables; configuration-specific extras may be added
# here as needed.
CPPROCFLAGS :=
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Every DEBUG_TYPE other than "noopt" is optimized at -O2.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels.
# NOTE: -fomit-frame-pointer is required by some kernels because they
# make explicit use of the rbp register.
CKOPTFLAGS := $(COPTFLAGS) -O3 -fomit-frame-pointer

# Vector flags for optimized kernels, selected per compiler vendor.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xCORE-AVX512
else ifeq ($(CC_VENDOR),clang)
  # NOTE: -march=haswell has to be used on Windows because AVX512
  # apparently uses an alternate calling convention in which xmm registers
  # are not callee-saved on the stack. Mixing that with framework code
  # compiled for general x86_64 mode causes chaos (e.g. #514).
  ifeq ($(IS_WIN),yes)
  CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=haswell
  else
  CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512
  endif
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# The assembler on OS X won't recognize AVX512 without help; icc is
# exempt from this extra flag.
ifeq ($(OS_NAME),Darwin)
  ifneq ($(CC_VENDOR),icc)
  CKVECFLAGS += -Wa,-march=skylake-avx512
  endif
endif

# Flags specific to reference kernels.
# NOTE: AVX2 is used for reference kernels because, as Jeff Hammond says,
# reference kernel code "is not going to achieve high enough SIMD
# utilization to overcome the AVX-512 frequency drop". (Issue #187)
CROPTFLAGS := $(CKOPTFLAGS)

ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),icc)
CRVECFLAGS := -xCORE-AVX2
else ifeq ($(CC_VENDOR),clang)
  # NOTE: As for the optimized kernels above, Windows builds fall back to
  # -march=haswell to avoid the AVX512 calling-convention problem (#514).
  ifeq ($(IS_WIN),yes)
  CRVECFLAGS := -march=haswell -funsafe-math-optimizations -ffp-contract=fast
  else
  CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
  endif
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Copy every variable set above into a counterpart whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/steamroller/ 0000775 0000000 0000000 00000000000 14634250137 0021425 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/steamroller/bli_cntx_init_steamroller.c 0000664 0000000 0000000 00000006307 14634250137 0027035 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
  Populate a context for the steamroller configuration.

  The context is first filled by bli_cntx_init_steamroller_ref(), after
  which the native gemm micro-kernels and the level-3 blocksizes are
  overridden with the values listed below.

  Parameters:
  - cntx:  The context to initialize; it is forwarded to every setter
           called here.
*/
void bli_cntx_init_steamroller( cntx_t* cntx )
{
	// Scratch table of blocksize objects, indexed by blocksize id.
	blksz_t bszs[ BLIS_NUM_BLKSZS ];

	// Begin with the default kernel blocksizes and functions.
	bli_cntx_init_steamroller_ref( cntx );

	// ---------------------------------------------------------------------

	// Register the optimized native gemm micro-kernels, one per datatype,
	// together with their storage preferences (the trailing flag of each
	// row). The leading 4 is the number of rows that follow.
	bli_cntx_set_l3_nat_ukrs
	(
	  4,
	  BLIS_GEMM_UKR, BLIS_FLOAT,    bli_sgemm_piledriver_asm_16x3, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE,   bli_dgemm_piledriver_asm_8x3,  FALSE,
	  BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2,  FALSE,
	  BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2,  FALSE,
	  cntx
	);

	// Architecture-specific level-3 blocksizes. Each call fills one entry
	// of the scratch table; the entries are independent of one another.
	//                                       s      d      c      z
	bli_blksz_init_easy( bszs + BLIS_NC,  8400,  8400,  8400,  8400 );
	bli_blksz_init_easy( bszs + BLIS_KC,   128,   128,   256,   160 );
	bli_blksz_init_easy( bszs + BLIS_MC,  2016,  1008,   512,   400 );
	bli_blksz_init_easy( bszs + BLIS_NR,     3,     3,     2,     2 );
	bli_blksz_init_easy( bszs + BLIS_MR,    16,     8,     4,     2 );

	// Install the register and cache blocksizes (and their multiples) for
	// native execution. Each row gives a blocksize id, the object holding
	// its values, and the id of the blocksize it is paired with as its
	// multiple.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, bszs + BLIS_NC, BLIS_NR,
	  BLIS_KC, bszs + BLIS_KC, BLIS_KR,
	  BLIS_MC, bszs + BLIS_MC, BLIS_MR,
	  BLIS_NR, bszs + BLIS_NR, BLIS_NR,
	  BLIS_MR, bszs + BLIS_MR, BLIS_MR,
	  cntx
	);
}
cython-blis-1.0.0/blis/_src/config/steamroller/bli_family_steamroller.h 0000664 0000000 0000000 00000003505 14634250137 0026321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family-level configuration macros for this configuration. The include
// guard (and its matching #endif on the last line) is commented out.
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// The only setting overridden here is the SIMD alignment size.
// NOTE(review): the value is presumably expressed in bytes -- confirm
// against the code that consumes it.
#define BLIS_SIMD_ALIGN_SIZE 16
//#endif
cython-blis-1.0.0/blis/_src/config/steamroller/make_defs.mk 0000664 0000000 0000000 00000006152 14634250137 0023700 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
# Name of this configuration. Appending it to the running list of
# configurations included by common.mk is currently commented out.
THIS_CONFIG := steamroller
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends its own general-purpose, configuration-agnostic
# flags to these variables; configuration-specific extras may be added
# here as needed.
CPPROCFLAGS :=
CMISCFLAGS  :=
CPICFLAGS   :=
CWARNFLAGS  :=

# Debug symbols are requested for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Every DEBUG_TYPE other than "noopt" is optimized at -O2.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2
else
COPTFLAGS := -O0
endif

# Flags specific to optimized kernels.
CKOPTFLAGS := $(COPTFLAGS) -O3

# gcc and clang receive the same vector flags; any other vendor is
# rejected outright.
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
else
$(error gcc or clang are required for this configuration.)
endif

# Flags specific to reference kernels: the optimized-kernel flags, plus
# relaxed floating-point options for gcc and clang.
CROPTFLAGS := $(CKOPTFLAGS)

ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif

# Copy every variable set above into a counterpart whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/template/ 0000775 0000000 0000000 00000000000 14634250137 0020707 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/template/bli_cntx_init_template.c 0000664 0000000 0000000 00000007657 14634250137 0025612 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
  Populate a context for the template configuration.

  The context is first filled by bli_cntx_init_template_ref(), after which
  the double-precision complex template kernels (level-3 micro-kernels,
  level-1f and level-1v kernels) and the level-3 blocksizes are installed.

  Parameters:
  - cntx:  The context to initialize; it is forwarded to every setter
           called here.
*/
void bli_cntx_init_template( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_template_ref( cntx );

	// ---------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences. The leading integer is the number of
	// ( kernel id, datatype, function, preference ) rows that follow.
	bli_cntx_set_l3_nat_ukrs
	(
	  5,
	  BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_template_noopt,       FALSE,
	  BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE,
	  BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE,
	  BLIS_TRSM_L_UKR,     BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt,     FALSE,
	  BLIS_TRSM_U_UKR,     BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt,     FALSE,
	  cntx
	);

	// Update the context with optimized level-1f kernels. Like the other
	// variadic setters in this function, the argument list must open with
	// the number of ( kernel id, datatype, function ) rows; without it the
	// first kernel id would be consumed as the count.
	bli_cntx_set_l1f_kers
	(
	  5,
	  BLIS_AXPY2V_KER,    BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt,
	  BLIS_DOTAXPYV_KER,  BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt,
	  BLIS_AXPYF_KER,     BLIS_DCOMPLEX, bli_zaxpyf_template_noopt,
	  BLIS_DOTXF_KER,     BLIS_DCOMPLEX, bli_zdotxf_template_noopt,
	  BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt,
	  cntx
	);

	// Update the context with optimized level-1v kernels (two rows, so
	// the leading count is 2).
	bli_cntx_set_l1v_kers
	(
	  2,
	  BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt,
	  BLIS_DOTV_KER,  BLIS_DCOMPLEX, bli_zdotv_template_noopt,
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific
	// values. Only the dcomplex (z) column is populated; the s, d and c
	// columns are passed as 0.
	// NOTE(review): a 0 presumably leaves the reference default for that
	// datatype in place -- confirm against bli_cntx_set_blkszs().
	//                                           s     d     c     z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],     0,    0,    0,    4 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],     0,    0,    0,    4 );
	bli_blksz_init_easy( &blkszs[ BLIS_MC ],     0,    0,    0,  128 );
	bli_blksz_init_easy( &blkszs[ BLIS_KC ],     0,    0,    0,  256 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ],     0,    0,    0, 4096 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 5,
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  cntx
	);
}
cython-blis-1.0.0/blis/_src/config/template/bli_family_template.h 0000664 0000000 0000000 00000003307 14634250137 0025065 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/template/kernels/ 0000775 0000000 0000000 00000000000 14634250137 0022352 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/template/kernels/1/ 0000775 0000000 0000000 00000000000 14634250137 0022512 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/template/kernels/1/bli_axpyv_template_noopt_var1.c 0000664 0000000 0000000 00000015410 14634250137 0030717 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zaxpyv_template_noopt
     (
       conj_t             conjx,
       dim_t              n,
       dcomplex* restrict alpha,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict y, inc_t incy,
       cntx_t*   restrict cntx
     )
{
/*
  Template axpyv kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs a vector scale and accumulate (axpy) operation:

    y := y + alpha * conjx( x )

  where x and y are vectors of length n and alpha is a scalar.

  Parameters:

  - conjx:  Compute with conjugated values of x?
  - n:      The number of elements in vectors x and y.
  - alpha:  The address of a scalar.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - cntx:   The context; it is only forwarded to the reference kernel.

  The kernel returns immediately, leaving y untouched, when n is zero or
  when alpha is zero.

  This template code calls the reference implementation if any of the
  following conditions are true:
  - Either of the strides incx or incy is non-unit.
  - Vectors x and y are unaligned with different offsets.
  - Vectors x and y are unaligned by the same offset, but the distance to
    the next alignment boundary is not a whole number of elements.

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE.)

  Additional things to consider:
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
	const dim_t n_elem_per_reg  = 1;
	const dim_t n_iter_unroll   = 1;
	const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
	const siz_t type_size       = sizeof( *x );

	dcomplex*   xp;
	dcomplex*   yp;

	bool        use_ref = FALSE;

	dim_t       n_pre = 0;
	dim_t       n_iter;
	dim_t       n_left;

	dim_t       off_x, off_y;
	dim_t       bytes_to_align;

	dim_t       i;

	// Nothing to do for empty vectors or a zero scalar.
	if ( bli_zero_dim1( n ) ) return;
	if ( bli_zeq0( *alpha ) ) return;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( bli_has_nonunit_inc2( incx, incy ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
	          bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
	{
		use_ref = TRUE;

		// If x and y are unaligned by the same offset, then we can still
		// use an implementation that depends on alignment for most of the
		// operation.
		off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
		off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );

		if ( off_x == off_y )
		{
			// The front edge must cover the distance from x (and y) up to
			// the NEXT alignment boundary, not the offset past the
			// previous one.
			// NOTE(review): this assumes bli_offset_from_alignment()
			// yields the address modulo the alignment -- confirm.
			bytes_to_align = ( dim_t )BLIS_SIMD_ALIGN_SIZE - off_x;

			// Alignment is only reachable if that distance is a whole
			// number of elements; otherwise stay with the reference code.
			if ( bytes_to_align % ( dim_t )type_size == 0 )
			{
				use_ref = FALSE;
				n_pre   = bytes_to_align / ( dim_t )type_size;

				// Never process more front-edge elements than the vectors
				// contain; otherwise the front-edge loop would run past
				// the end of x and y for short vectors.
				if ( n_pre > n ) n_pre = n;
			}
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		zaxpyv_ft f = bli_zaxpyv_template_ref;

		f
		(
		  conjx,
		  n,
		  alpha,
		  x, incx,
		  y, incy,
		  cntx
		);
		return;
	}

	// Compute the number of unrolled and leftover (edge) iterations.
	// n_pre <= n holds here, so neither quantity can be negative.
	n_iter = ( n - n_pre ) / n_elem_per_iter;
	n_left = ( n - n_pre ) % n_elem_per_iter;

	// Initialize pointers into x and y.
	xp = x;
	yp = y;

	// Iterate over elements of x and y to compute:
	//   y += alpha * conjx( x );
	if ( bli_is_noconj( conjx ) )
	{
		// Compute front edge cases if x and y were unaligned.
		for ( i = 0; i < n_pre; ++i )
		{
			bli_zaxpys( *alpha, *xp, *yp );

			xp += 1; yp += 1;
		}

		// The bulk of the operation is executed here. The addresses xp and
		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
		for ( i = 0; i < n_iter; ++i )
		{
			bli_zaxpys( *alpha, *xp, *yp );

			xp += n_elem_per_iter;
			yp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < n_left; ++i )
		{
			bli_zaxpys( *alpha, *xp, *yp );

			xp += 1; yp += 1;
		}
	}
	else // if ( bli_is_conj( conjx ) )
	{
		// Compute front edge cases if x and y were unaligned.
		for ( i = 0; i < n_pre; ++i )
		{
			bli_zaxpyjs( *alpha, *xp, *yp );

			xp += 1; yp += 1;
		}

		// The bulk of the operation is executed here. The addresses xp and
		// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
		for ( i = 0; i < n_iter; ++i )
		{
			bli_zaxpyjs( *alpha, *xp, *yp );

			xp += n_elem_per_iter;
			yp += n_elem_per_iter;
		}

		// Compute tail edge cases, if applicable.
		for ( i = 0; i < n_left; ++i )
		{
			bli_zaxpyjs( *alpha, *xp, *yp );

			xp += 1; yp += 1;
		}
	}
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1/bli_dotv_template_noopt_var1.c 0000664 0000000 0000000 00000017250 14634250137 0030530 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zdotv_template_noopt
(
conj_t conjx,
conj_t conjy,
dim_t n,
dcomplex* restrict x, inc_t incx,
dcomplex* restrict y, inc_t incy,
dcomplex* restrict rho,
cntx_t* restrict cntx
)
{
/*
Template dotv kernel implementation
This function contains a template implementation for a double-precision
complex kernel, coded in C, which can serve as the starting point for one
to write an optimized kernel on an arbitrary architecture. (We show a
template implementation for only double-precision complex because the
templates for the other three floating-point types would be similar, with
the real instantiations being noticeably simpler due to the disappearance
of conjugation in the real domain.)
This kernel performs an inner (dot) product operation:
rho := conjx( x^T ) * conjy( y )
where x and y are vectors of length n and rho is a scalar.
Parameters:
- conjx: Compute with conjugated values of x?
- conjy: Compute with conjugated values of y?
- n: The number of elements in vectors x and y.
- x: The address of vector x.
- incx: The vector increment of x. incx should be unit unless the
implementation makes special accommodation for non-unit values.
- y: The address of vector y.
- incy: The vector increment of y. incy should be unit unless the
implementation makes special accommodation for non-unit values.
- rho: The address of the output scalar.
This template code calls the reference implementation if any of the
following conditions are true:
- Either of the strides incx or incy is non-unit.
- Vectors x and y are unaligned with different offsets.
If the vectors are aligned, or unaligned by the same offset, then optimized
code can be used for the bulk of the computation. This template shows how
the front-edge case can be handled so that the remaining computation is
aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE.)
Additional things to consider:
- While four combinations of possible values of conjx and conjy exist, we
implement only conjugation on x explicitly; we induce the other two cases
by toggling the effective conjugation on x and then conjugating the dot
product result.
- Because conjugation disappears in the real domain, real instances of
this kernel can safely ignore the values of any conjugation parameters,
thereby simplifying the implementation.
For more info, please refer to the BLIS website and/or contact the
blis-devel mailing list.
-FGVZ
*/
const dim_t n_elem_per_reg = 1;
const dim_t n_iter_unroll = 1;
const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;
const siz_t type_size = sizeof( *x );
dcomplex* xp;
dcomplex* yp;
dcomplex dotxy;
bool use_ref = FALSE;
dim_t n_pre = 0;
dim_t n_iter;
dim_t n_left;
dim_t off_x, off_y;
dim_t i;
conj_t conjx_use;
// If the vector lengths are zero, set rho to zero and return.
if ( bli_zero_dim1( n ) )
{
bli_zset0s( *rho );
return;
}
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( bli_has_nonunit_inc2( incx, incy ) )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) )
{
use_ref = TRUE;
// If x and y are unaligned by the same offset, then we can still use
// an implementation that depends on alignment for most of the
// operation.
off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
if ( off_x == off_y )
{
use_ref = FALSE;
n_pre = off_x / type_size;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
zdotv_ft f = bli_zdotv_template_ref;
f
(
conjx,
conjy,
n,
x, incx,
y, incy,
rho,
cntx
);
return;
}
// Compute the number of unrolled and leftover (edge) iterations.
n_iter = ( n - n_pre ) / n_elem_per_iter;
n_left = ( n - n_pre ) % n_elem_per_iter;
// Initialize pointers into x and y.
xp = x;
yp = y;
// Initialize accumulator to zero.
bli_zset0s( dotxy );
conjx_use = conjx;
// If y must be conjugated, we compute the result indirectly by first
// toggling the effective conjugation of x and then conjugating the
// resulting dot product.
if ( bli_is_conj( conjy ) )
bli_toggle_conj( &conjx_use );
// Iterate over elements of x and y to compute:
// rho = conjx( x^T ) * conjy( y );
if ( bli_is_noconj( conjx_use ) )
{
// Compute front edge cases if x and y were unaligned.
for ( i = 0; i < n_pre; ++i )
{
bli_zdots( *xp, *yp, dotxy );
xp += 1; yp += 1;
}
// The bulk of the operation is executed here. The addresses xp and
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
for ( i = 0; i < n_iter; ++i )
{
bli_zdots( *xp, *yp, dotxy );
xp += n_elem_per_iter;
yp += n_elem_per_iter;
}
// Compute tail edge cases, if applicable.
for ( i = 0; i < n_left; ++i )
{
bli_zdots( *xp, *yp, dotxy );
xp += 1; yp += 1;
}
}
else // if ( bli_is_conj( conjx_use ) )
{
// Compute front edge cases if x and y were unaligned.
for ( i = 0; i < n_pre; ++i )
{
bli_zdotjs( *xp, *yp, dotxy );
xp += 1; yp += 1;
}
// The bulk of the operation is executed here. The addresses xp and
// yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE.
for ( i = 0; i < n_iter; ++i )
{
bli_zdotjs( *xp, *yp, dotxy );
xp += n_elem_per_iter;
yp += n_elem_per_iter;
}
// Compute tail edge cases, if applicable.
for ( i = 0; i < n_left; ++i )
{
bli_zdotjs( *xp, *yp, dotxy );
xp += 1; yp += 1;
}
}
// If conjugation on y was requested, we induce it by conjugating
// the contents of dotxy.
if ( bli_is_conj( conjy ) )
bli_zconjs( dotxy );
bli_zcopys( dotxy, *rho );
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1f/ 0000775 0000000 0000000 00000000000 14634250137 0022660 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c 0000664 0000000 0000000 00000024066 14634250137 0031156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zaxpy2v_template_noopt
     (
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       dcomplex* restrict alpha1,
       dcomplex* restrict alpha2,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict y, inc_t incy,
       dcomplex* restrict z, inc_t incz,
       cntx_t*   restrict cntx
     )
{
/*
  Template axpy2v kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel fuses two axpyv operations:

    z := z + alpha1 * conjx( x )
    z := z + alpha2 * conjy( y )

  where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars.

  Parameters:

  - conjx:  Compute with conjugated values of x?
  - conjy:  Compute with conjugated values of y?
  - n:      The number of elements in vectors x, y, and z.
  - alpha1: The address of the scalar to be applied to x.
  - alpha2: The address of the scalar to be applied to y.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.
  - cntx:   The context; it is only forwarded to the reference
            implementation.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - Any of the strides incx, incy, or incz is non-unit.
  - Vectors x, y, and z are unaligned with different offsets, or with a
    common offset that is not a whole number of elements (in which case no
    element of the vectors can ever be aligned).

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment in the main loops to be
  BLIS_SIMD_ALIGN_SIZE.) The number of front-edge elements never exceeds n,
  so short vectors are handled entirely by the front-edge loop.

  Here are a few additional things to consider:

  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   xp;
    dcomplex*   yp;
    dcomplex*   zp;

    bool        use_ref = FALSE;

    dim_t       n_pre = 0;
    dim_t       n_iter;
    dim_t       n_left;

    dim_t       off_x, off_y, off_z;

    dim_t       i;

    // Return early if possible.
    if ( bli_zero_dim1( n ) ) return;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( bli_has_nonunit_inc3( incx, incy, incz ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If x, y, and z are unaligned by the same offset, then we can
        // still use an implementation that depends on alignment for most
        // of the operation.
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
        off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );

        // NOTE(review): this assumes bli_offset_from_alignment() returns the
        // number of bytes by which the address lies past the previous
        // aligned boundary -- confirm against its definition. Under that
        // assumption an aligned element exists only if the offset is a
        // whole number of elements.
        if ( off_x == off_y && off_x == off_z &&
             ( siz_t )off_x % type_size == 0 )
        {
            use_ref = FALSE;

            // The front edge must cover the bytes that remain until the
            // next aligned boundary, not the bytes already past the
            // previous one.
            n_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / ( dim_t )type_size;

            // Never process more front-edge elements than the vectors
            // contain; otherwise short vectors would be accessed out of
            // bounds.
            if ( n_pre > n ) n_pre = n;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        zaxpy2v_ft f = bli_zaxpy2v_template_ref;

        f
        (
          conjx,
          conjy,
          n,
          alpha1,
          alpha2,
          x, incx,
          y, incy,
          z, incz,
          cntx
        );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Initialize pointers into x, y, and z.
    xp = x;
    yp = y;
    zp = z;

    // Iterate over elements of x, y, and z to compute:
    //   z += alpha1 * conjx( x ) + alpha2 * conjy( y );
    if ( bli_is_noconj( conjx ) && bli_is_noconj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha1 and alpha2 should be loaded once prior to the n_iter
        // loop and the elements of z should be loaded and stored only once
        // each. The addresses xp, yp, and zp are guaranteed to be aligned
        // to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_noconj( conjx ) && bli_is_conj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha1 and alpha2 should be loaded once prior to the n_iter
        // loop and the elements of z should be loaded and stored only once
        // each. The addresses xp, yp, and zp are guaranteed to be aligned
        // to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zaxpys( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_conj( conjx ) && bli_is_noconj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha1 and alpha2 should be loaded once prior to the n_iter
        // loop and the elements of z should be loaded and stored only once
        // each. The addresses xp, yp, and zp are guaranteed to be aligned
        // to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpys( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else // if ( bli_is_conj( conjx ) && bli_is_conj( conjy ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha1 and alpha2 should be loaded once prior to the n_iter
        // loop and the elements of z should be loaded and stored only once
        // each. The addresses xp, yp, and zp are guaranteed to be aligned
        // to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zaxpyjs( *alpha1, *xp, *zp );
            bli_zaxpyjs( *alpha2, *yp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c 0000664 0000000 0000000 00000022343 14634250137 0031050 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zaxpyf_template_noopt
     (
       conj_t             conja,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       dcomplex* restrict alpha,
       dcomplex* restrict a, inc_t inca, inc_t lda,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict y, inc_t incy,
       cntx_t*   restrict cntx
     )
{
/*
  Template axpyf kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs the following gemv-like operation:

    y := y + alpha * conja( A ) * conjx( x )

  where A is an m x b_n matrix, x is a vector of length b_n, y is a vector
  of length m, and alpha is a scalar. The operation is performed as a series
  of fused axpyv operations, and therefore A should be column-stored.

  Parameters:

  - conja:  Compute with conjugated values of A?
  - conjx:  Compute with conjugated values of x?
  - m:      The number of rows in matrix A.
  - b_n:    The number of columns in matrix A. Must be equal to or less than
            the fusing factor.
  - alpha:  The address of a scalar.
  - a:      The address of matrix A.
  - inca:   The row stride of A. inca should be unit unless the
            implementation makes special accommodation for non-unit values.
  - lda:    The column stride of A.
  - x:      The address of vector x.
  - incx:   The vector increment of x.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - cntx:   The context; it is only forwarded to the reference
            implementation.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - b_n is less than the fusing factor.
  - Any of the strides inca, incx, or incy is non-unit.
  - The address of A, the second column of A, and y are unaligned with
    different offsets, or with a common offset that is not a whole number
    of elements (in which case no element can ever be aligned).

  If the first/second columns of A and address of y are aligned, or unaligned
  by the same offset, then optimized code can be used for the bulk of the
  computation. This template shows how the front-edge case can be handled so
  that the remaining computation is aligned. (This template guarantees
  alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) The number of
  front-edge rows never exceeds m, so short columns are handled entirely by
  the front-edge loop.

  Additional things to consider:

  - When optimizing, you should fully unroll the loops over b_n. This is the
    dimension across which we are fusing axpyv operations.
  - This template code chooses to call the reference implementation whenever
    b_n is less than the fusing factor, so as to avoid having to handle edge
    cases. One may choose to optimize this edge case, if desired.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *a );

    dcomplex*   ap[ bli_zaxpyf_fusefac ];
    dcomplex*   xp[ bli_zaxpyf_fusefac ];
    dcomplex*   yp;

    dcomplex    alpha_x[ bli_zaxpyf_fusefac ];

    bool        use_ref = FALSE;

    dim_t       m_pre = 0;
    dim_t       m_iter;
    dim_t       m_left;

    dim_t       off_a, off_a2, off_y;

    dim_t       i, j;

    // Return early if possible.
    if ( bli_zero_dim2( m, b_n ) ) return;

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( b_n < bli_zaxpyf_fusefac )
    {
        use_ref = TRUE;
    }
    else if ( bli_has_nonunit_inc3( inca, incx, incy ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y,     BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If a, the second column of a, and y are unaligned by the same
        // offset, then we can still use an implementation that depends on
        // alignment for most of the operation.
        off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
        off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
        off_y  = bli_offset_from_alignment( y,     BLIS_SIMD_ALIGN_SIZE );

        // NOTE(review): this assumes bli_offset_from_alignment() returns the
        // number of bytes by which the address lies past the previous
        // aligned boundary -- confirm against its definition. Under that
        // assumption an aligned element exists only if the offset is a
        // whole number of elements.
        if ( off_a == off_y && off_a == off_a2 &&
             ( siz_t )off_a % type_size == 0 )
        {
            use_ref = FALSE;

            // The front edge must cover the bytes that remain until the
            // next aligned boundary, not the bytes already past the
            // previous one.
            m_pre = ( BLIS_SIMD_ALIGN_SIZE - off_a ) / ( dim_t )type_size;

            // Never process more front-edge rows than A and y contain;
            // otherwise short columns would be accessed out of bounds.
            if ( m_pre > m ) m_pre = m;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        zaxpyf_ft f = bli_zaxpyf_template_ref;

        f
        (
          conja,
          conjx,
          m,
          b_n,
          alpha,
          a, inca, lda,
          x, incx,
          y, incy,
          cntx
        );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    m_iter = ( m - m_pre ) / n_elem_per_iter;
    m_left = ( m - m_pre ) % n_elem_per_iter;

    // Initialize pointers into the columns of A and elements of x.
    for ( j = 0; j < b_n; ++j )
    {
        ap[ j ] = a + (j  )*lda;
        xp[ j ] = x + (j  )*incx;
    }
    yp = y;

    // Load elements of x or conj(x) into alpha_x and scale by alpha.
    if ( bli_is_noconj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zcopys( *xp[ j ], alpha_x[ j ] );
            bli_zscals( *alpha, alpha_x[ j ] );
        }
    }
    else // if ( bli_is_conj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zcopyjs( *xp[ j ], alpha_x[ j ] );
            bli_zscals( *alpha, alpha_x[ j ] );
        }
    }

    // Iterate over rows of A and y to compute:
    //   y += conja( A ) * ( alpha * conjx( x ) );
    if ( bli_is_noconj( conja ) )
    {
        // Compute front edge cases if a and y were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, and the b_n loop should be fully unrolled. The addresses in
        // ap[] and yp are guaranteed to be aligned to
        // BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += n_elem_per_iter;
            }

            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }
    }
    else // if ( bli_is_conj( conja ) )
    {
        // Compute front edge cases if a and y were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, and the b_n loop should be fully unrolled. The addresses in
        // ap[] and yp are guaranteed to be aligned to
        // BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += n_elem_per_iter;
            }

            yp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp );

                ap[ j ] += 1;
            }

            yp += 1;
        }
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c 0000664 0000000 0000000 00000026040 14634250137 0031575 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zdotaxpyv_template_noopt
     (
       conj_t             conjxt,
       conj_t             conjx,
       conj_t             conjy,
       dim_t              n,
       dcomplex* restrict alpha,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict y, inc_t incy,
       dcomplex* restrict rho,
       dcomplex* restrict z, inc_t incz,
       cntx_t*   restrict cntx
     )
{
/*
  Template dotaxpyv kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel fuses a dotv and axpyv operation:

    rho := conjxt( x^T ) * conjy( y )
    z   := z + alpha * conjx( x )

  where x, y, and z are vectors of length n, alpha is a scalar, and rho is
  the output scalar of the dot product.

  Parameters:

  - conjxt: Compute with conjugated values of x^T?
  - conjx:  Compute with conjugated values of x?
  - conjy:  Compute with conjugated values of y?
  - n:      The number of elements in vectors x, y, and z.
  - alpha:  The address of the scalar to be applied to x.
  - x:      The address of vector x.
  - incx:   The vector increment of x. incx should be unit unless the
            implementation makes special accommodation for non-unit values.
  - y:      The address of vector y.
  - incy:   The vector increment of y. incy should be unit unless the
            implementation makes special accommodation for non-unit values.
  - rho:    The address of the output scalar of the dotv subproblem. It is
            set to zero when n is zero.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.
  - cntx:   The context; it is only forwarded to the reference
            implementation.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - Any of the strides incx, incy, or incz is non-unit.
  - Vectors x, y, and z are unaligned with different offsets, or with a
    common offset that is not a whole number of elements (in which case no
    element of the vectors can ever be aligned).

  If the vectors are aligned, or unaligned by the same offset, then optimized
  code can be used for the bulk of the computation. This template shows how
  the front-edge case can be handled so that the remaining computation is
  aligned. (This template guarantees alignment in the main loops to be
  BLIS_SIMD_ALIGN_SIZE.) The number of front-edge elements never exceeds n,
  so short vectors are handled entirely by the front-edge loop.

  Here are a few additional things to consider:

  - While four combinations of possible values of conjxt and conjy exist, we
    implement only conjugation on x^T explicitly; we induce the other two
    cases by toggling the effective conjugation on x^T and then conjugating
    the dot product result.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   xp;
    dcomplex*   yp;
    dcomplex*   zp;
    dcomplex    dotxy;

    bool        use_ref = FALSE;

    dim_t       n_pre = 0;
    dim_t       n_iter;
    dim_t       n_left;

    dim_t       off_x, off_y, off_z;

    dim_t       i;

    conj_t      conjxt_use;

    // If the vector lengths are zero, set rho to zero and return.
    if ( bli_zero_dim1( n ) )
    {
        bli_zset0s( *rho );
        return;
    }

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( bli_has_nonunit_inc3( incx, incy, incz ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If x, y, and z are unaligned by the same offset, then we can
        // still use an implementation that depends on alignment for most
        // of the operation.
        off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE );
        off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE );
        off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE );

        // NOTE(review): this assumes bli_offset_from_alignment() returns the
        // number of bytes by which the address lies past the previous
        // aligned boundary -- confirm against its definition. Under that
        // assumption an aligned element exists only if the offset is a
        // whole number of elements.
        if ( off_x == off_y && off_x == off_z &&
             ( siz_t )off_x % type_size == 0 )
        {
            use_ref = FALSE;

            // The front edge must cover the bytes that remain until the
            // next aligned boundary, not the bytes already past the
            // previous one.
            n_pre = ( BLIS_SIMD_ALIGN_SIZE - off_x ) / ( dim_t )type_size;

            // Never process more front-edge elements than the vectors
            // contain; otherwise short vectors would be accessed out of
            // bounds.
            if ( n_pre > n ) n_pre = n;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        zdotaxpyv_ft f = bli_zdotaxpyv_template_ref;

        f
        (
          conjxt,
          conjx,
          conjy,
          n,
          alpha,
          x, incx,
          y, incy,
          rho,
          z, incz,
          cntx
        );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    n_iter = ( n - n_pre ) / n_elem_per_iter;
    n_left = ( n - n_pre ) % n_elem_per_iter;

    // Initialize pointers into x, y, and z.
    xp = x;
    yp = y;
    zp = z;

    // Initialize accumulator to zero.
    bli_zset0s( dotxy );

    conjxt_use = conjxt;

    // If y must be conjugated, we compute the result indirectly by first
    // toggling the effective conjugation of xt and then conjugating the
    // resulting dot product.
    if ( bli_is_conj( conjy ) )
        bli_toggle_conj( &conjxt_use );

    // Iterate over elements of x, y, and z to compute:
    //   r  = conjxt( x^T ) * conjy( y );
    //   z += alpha * conjx( x );
    if ( bli_is_noconj( conjx ) && bli_is_noconj( conjxt_use ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha should be loaded once prior to the n_iter loop, dotxy
        // should be kept in registers, and each element of x should be
        // loaded only once. The addresses xp, yp, and zp are guaranteed to
        // be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_noconj( conjx ) && bli_is_conj( conjxt_use ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha should be loaded once prior to the n_iter loop, dotxy
        // should be kept in registers, and each element of x should be
        // loaded only once. The addresses xp, yp, and zp are guaranteed to
        // be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpys( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else if ( bli_is_conj( conjx ) && bli_is_noconj( conjxt_use ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha should be loaded once prior to the n_iter loop, dotxy
        // should be kept in registers, and each element of x should be
        // loaded only once. The addresses xp, yp, and zp are guaranteed to
        // be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zdots( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }
    else // if ( bli_is_conj( conjx ) && bli_is_conj( conjxt_use ) )
    {
        // Compute front edge cases if x, y, and z were unaligned.
        for ( i = 0; i < n_pre; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // alpha should be loaded once prior to the n_iter loop, dotxy
        // should be kept in registers, and each element of x should be
        // loaded only once. The addresses xp, yp, and zp are guaranteed to
        // be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < n_iter; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += n_elem_per_iter;
            yp += n_elem_per_iter;
            zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < n_left; ++i )
        {
            bli_zdotjs( *xp, *yp, dotxy );
            bli_zaxpyjs( *alpha, *xp, *zp );

            xp += 1; yp += 1; zp += 1;
        }
    }

    // If conjugation on y was requested, we induce it by conjugating
    // the contents of dotxy before it is stored to rho.
    if ( bli_is_conj( conjy ) )
        bli_zconjs( dotxy );

    bli_zcopys( dotxy, *rho );
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c 0000664 0000000 0000000 00000034000 14634250137 0031740 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zdotxaxpyf_template_noopt
     (
       conj_t             conjat,
       conj_t             conja,
       conj_t             conjw,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       dcomplex* restrict alpha,
       dcomplex* restrict a, inc_t inca, inc_t lda,
       dcomplex* restrict w, inc_t incw,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict beta,
       dcomplex* restrict y, inc_t incy,
       dcomplex* restrict z, inc_t incz,
       cntx_t*   restrict cntx
     )
{
/*
  Template dotxaxpyf kernel implementation

  This function contains a template implementation for a double-precision
  complex kernel, coded in C, which can serve as the starting point for one
  to write an optimized kernel on an arbitrary architecture. (We show a
  template implementation for only double-precision complex because the
  templates for the other three floating-point types would be similar, with
  the real instantiations being noticeably simpler due to the disappearance
  of conjugation in the real domain.)

  This kernel performs the following two gemv-like operations:

    y := beta * y + alpha * conjat( A^T ) * conjw( w )
    z :=        z + alpha * conja( A )    * conjx( x )

  where A is an m x b_n matrix, x and y are vectors of length b_n, w and z
  are vectors of length m, and alpha and beta are scalars. The operation
  fuses a dotxf and an axpyf operation, and therefore A should be column-
  stored.

  Parameters:

  - conjat: Compute with conjugated values of A^T?
  - conja:  Compute with conjugated values of A?
  - conjw:  Compute with conjugated values of w?
  - conjx:  Compute with conjugated values of x?
  - m:      The number of rows in matrix A.
  - b_n:    The number of columns in matrix A. Must be equal to or less than
            the fusing factor.
  - alpha:  The address of the scalar to be applied to A^T*w and A*x.
  - a:      The address of matrix A.
  - inca:   The row stride of A. inca should be unit unless the
            implementation makes special accommodation for non-unit values.
  - lda:    The column stride of A.
  - w:      The address of vector w.
  - incw:   The vector increment of w. incw should be unit unless the
            implementation makes special accommodation for non-unit values.
  - x:      The address of vector x.
  - incx:   The vector increment of x.
  - beta:   The address of the scalar to be applied to y.
  - y:      The address of vector y.
  - incy:   The vector increment of y.
  - z:      The address of vector z.
  - incz:   The vector increment of z. incz should be unit unless the
            implementation makes special accommodation for non-unit values.
  - cntx:   The context; it is only forwarded to the reference
            implementation.

  Degenerate dimensions:

  - If b_n is zero, y is empty and z receives no contribution, so the
    function returns immediately.
  - If m is zero (and b_n is not), z is empty and both matrix products
    vanish, but y is still scaled by beta before returning, as the
    operation definition above requires.

  This template code calls the reference implementation if any of the
  following conditions are true:

  - b_n is less than the fusing factor.
  - Any of the strides inca, incw, or incz is non-unit.
  - The address of A, the second column of A, w, and z are unaligned with
    different offsets.

  If the first/second columns of A and addresses of w and z are aligned, or
  unaligned by the same offset, then optimized code can be used for the bulk
  of the computation. This template shows how the front-edge case can be
  handled so that the remaining computation is aligned. (This template
  intends alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.)

  Additional things to consider:

  - When optimizing, you should fully unroll the loops over b_n. This is the
    dimension across which we are fusing dotxv operations.
  - This template code chooses to call the reference implementation whenever
    b_n is less than the fusing factor, so as to avoid having to handle edge
    cases. One may choose to optimize this edge case, if desired.
  - Because conjugation disappears in the real domain, real instances of
    this kernel can safely ignore the values of any conjugation parameters,
    thereby simplifying the implementation.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *a );

    dcomplex*   ap[ bli_zdotxaxpyf_fusefac ];
    dcomplex*   xp[ bli_zdotxaxpyf_fusefac ];
    dcomplex*   yp[ bli_zdotxaxpyf_fusefac ];
    dcomplex*   wp;
    dcomplex*   zp;

    dcomplex    At_w[ bli_zdotxaxpyf_fusefac ];
    dcomplex    alpha_x[ bli_zdotxaxpyf_fusefac ];

    bool        use_ref = FALSE;

    dim_t       m_pre = 0;
    dim_t       m_iter;
    dim_t       m_left;

    dim_t       off_a, off_a2, off_w, off_z;
    dim_t       i, j;

    conj_t      conjat_use;

    // Return early if A has no columns: y is empty and nothing is added
    // to z.
    if ( b_n == 0 ) return;

    // If A has no rows, z is empty and A^T * w is zero, but y must still
    // be scaled by beta (y := beta * y). Returning without doing so would
    // silently leave y unscaled.
    if ( m == 0 )
    {
        for ( j = 0; j < b_n; ++j )
        {
            dcomplex* yj = y + (j  )*incy;

            bli_zscals( *beta, *yj );
        }
        return;
    }

    // If there is anything that would interfere with our use of aligned
    // vector loads/stores, call the reference implementation.
    if ( b_n < bli_zdotxaxpyf_fusefac )
    {
        use_ref = TRUE;
    }
    else if ( bli_has_nonunit_inc3( inca, incw, incz ) )
    {
        use_ref = TRUE;
    }
    else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( w,     BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( z,     BLIS_SIMD_ALIGN_SIZE ) )
    {
        use_ref = TRUE;

        // If a, the second column of a, w, and z are unaligned by the same
        // offset, then we can still use an implementation that depends on
        // alignment for most of the operation.
        off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
        off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
        off_w  = bli_offset_from_alignment( w,     BLIS_SIMD_ALIGN_SIZE );
        off_z  = bli_offset_from_alignment( z,     BLIS_SIMD_ALIGN_SIZE );

        if ( off_a == off_a2 && off_a == off_w && off_a == off_z )
        {
            use_ref = FALSE;

            // NOTE(review): this converts the reported byte offset directly
            // into a count of front-edge elements. Whether that offset is
            // the distance *to* the next aligned address or *past* the
            // previous one is not visible here -- confirm, and confirm that
            // m_pre can never exceed m.
            m_pre   = off_a / type_size;
        }
    }

    // Call the reference implementation if needed.
    if ( use_ref == TRUE )
    {
        zdotxaxpyf_ft f = bli_zdotxaxpyf_template_ref;

        f
        (
          conjat,
          conja,
          conjw,
          conjx,
          m,
          b_n,
          alpha,
          a, inca, lda,
          w, incw,
          x, incx,
          beta,
          y, incy,
          z, incz,
          cntx
        );
        return;
    }

    // Compute the number of unrolled and leftover (edge) iterations.
    m_iter = ( m - m_pre ) / n_elem_per_iter;
    m_left = ( m - m_pre ) % n_elem_per_iter;

    // Initialize pointers into the columns of A and elements of x and y.
    for ( j = 0; j < b_n; ++j )
    {
        ap[ j ] = a + (j  )*lda;
        xp[ j ] = x + (j  )*incx;
        yp[ j ] = y + (j  )*incy;
    }
    wp = w;
    zp = z;

    // Load elements of x or conj(x) into alpha_x and scale by alpha.
    if ( bli_is_noconj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zcopys( *xp[ j ], alpha_x[ j ] );
            bli_zscals( *alpha, alpha_x[ j ] );
        }
    }
    else // if ( bli_is_conj( conjx ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zcopyjs( *xp[ j ], alpha_x[ j ] );
            bli_zscals( *alpha, alpha_x[ j ] );
        }
    }

    // Initialize our accumulators to zero.
    for ( j = 0; j < b_n; ++j )
    {
        bli_zset0s( At_w[ j ] );
    }

    conjat_use = conjat;

    // If w must be conjugated, we compute the result indirectly by first
    // toggling the effective conjugation of At and then conjugating the
    // resulting dot products.
    if ( bli_is_conj( conjw ) )
        bli_toggle_conj( &conjat_use );

    // Iterate over the columns of A and elements of w and z to compute:
    //   y = beta * y + alpha * conjat( A^T ) * conjw( w );
    //   z =        z + alpha * conja( A )    * conjx( x );
    // where A is m x b_n. Each of the four branches below handles one
    // combination of ( conja, conjat_use ); within a branch the first
    // macro call accumulates into At_w and the second updates z.
    if ( bli_is_noconj( conja ) && bli_is_noconj( conjat_use ) )
    {
        // Compute front edge cases if A, w, and z were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, At_w should be kept in registers, and the b_n loop should
        // be fully unrolled. The addresses in ap[], wp, and zp are
        // intended to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += n_elem_per_iter;
            }
            wp += n_elem_per_iter; zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }
    }
    else if ( bli_is_noconj( conja ) && bli_is_conj( conjat_use ) )
    {
        // Compute front edge cases if A, w, and z were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, At_w should be kept in registers, and the b_n loop should
        // be fully unrolled. The addresses in ap[], wp, and zp are
        // intended to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += n_elem_per_iter;
            }
            wp += n_elem_per_iter; zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdots( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }
    }
    else if ( bli_is_conj( conja ) && bli_is_noconj( conjat_use ) )
    {
        // Compute front edge cases if A, w, and z were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, At_w should be kept in registers, and the b_n loop should
        // be fully unrolled. The addresses in ap[], wp, and zp are
        // intended to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += n_elem_per_iter;
            }
            wp += n_elem_per_iter; zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdots( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }
    }
    else if ( bli_is_conj( conja ) && bli_is_conj( conjat_use ) )
    {
        // Compute front edge cases if A, w, and z were unaligned.
        for ( i = 0; i < m_pre; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }

        // The bulk of the operation is executed here. For best performance,
        // the elements of alpha_x should be loaded once prior to the m_iter
        // loop, At_w should be kept in registers, and the b_n loop should
        // be fully unrolled. The addresses in ap[], wp, and zp are
        // intended to be aligned to BLIS_SIMD_ALIGN_SIZE.
        for ( i = 0; i < m_iter; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += n_elem_per_iter;
            }
            wp += n_elem_per_iter; zp += n_elem_per_iter;
        }

        // Compute tail edge cases, if applicable.
        for ( i = 0; i < m_left; ++i )
        {
            for ( j = 0; j < b_n; ++j )
            {
                bli_zdotjs( *ap[ j ], *wp, At_w[ j ] );
                bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp );

                ap[ j ] += 1;
            }
            wp += 1; zp += 1;
        }
    }

    // If conjugation on w was requested, we induce it by conjugating
    // the contents of At_w.
    if ( bli_is_conj( conjw ) )
    {
        for ( j = 0; j < b_n; ++j )
        {
            bli_zconjs( At_w[ j ] );
        }
    }

    // Scale the At_w product by alpha and accumulate into y after
    // scaling by beta.
    for ( j = 0; j < b_n; ++j )
    {
        bli_zscals( *beta, *yp[ j ] );
        bli_zaxpys( *alpha, At_w[ j ], *yp[ j ] );
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c 0000664 0000000 0000000 00000023625 14634250137 0031051 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zdotxf_template_noopt
     (
       conj_t             conjat,
       conj_t             conjx,
       dim_t              m,
       dim_t              b_n,
       dcomplex* restrict alpha,
       dcomplex* restrict a, inc_t inca, inc_t lda,
       dcomplex* restrict x, inc_t incx,
       dcomplex* restrict beta,
       dcomplex* restrict y, inc_t incy,
       cntx_t*   restrict cntx
     )
{
/*
  Template dotxf kernel implementation

  A plain-C, double-precision complex starting point for writing an
  architecture-specific dotxf kernel. Only the dcomplex variant is shown;
  the other floating-point types would follow the same pattern, and the
  real ones become simpler because conjugation has no effect there.

  Operation (gemv-like):

    y := beta * y + alpha * conjat( A^T ) * conjx( x )

  with A of size m x b_n, x of length m, y of length b_n, and scalars alpha
  and beta. The work is organized as b_n fused dotxv operations, one per
  column of A, so A should be stored by columns.

  Parameters:

  - conjat: Whether to use conjugated values of A^T.
  - conjx:  Whether to use conjugated values of x.
  - m:      Number of rows of A.
  - b_n:    Number of columns of A; at most the fusing factor.
  - alpha:  Address of the scalar applied to the product A^T * x.
  - a:      Address of A.
  - inca:   Row stride of A. Only unit stride is handled here; any other
            value is routed to the reference implementation.
  - lda:    Column stride of A.
  - x:      Address of x.
  - incx:   Increment of x. Only unit stride is handled here; any other
            value is routed to the reference implementation.
  - beta:   Address of the scalar applied to y.
  - y:      Address of y.
  - incy:   Increment of y.
  - cntx:   Context, forwarded to bli_zscalv_ex() and to the reference
            implementation.

  The reference implementation is used when:

  - b_n is smaller than the fusing factor;
  - inca or incx is non-unit;
  - a, a+lda, and x are not all aligned and do not all share the same
    offset from BLIS_SIMD_ALIGN_SIZE.

  When those addresses are aligned, or misaligned by a common offset, a
  short front-edge loop is run first so that the main loop is meant to
  operate on BLIS_SIMD_ALIGN_SIZE-aligned addresses.

  Hints for optimizing:

  - Fully unroll the loops over b_n, the dimension across which dotxv
    operations are fused.
  - The b_n < fusing-factor edge case is delegated to the reference code
    here; it may be optimized separately if desired.
  - Real-domain instances may ignore the conjugation parameters.

  For more info, please refer to the BLIS website and/or contact the
  blis-devel mailing list.

  -FGVZ
*/
    const dim_t n_elem_per_reg  = 1;
    const dim_t n_iter_unroll   = 1;

    const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll;

    const siz_t type_size       = sizeof( *x );

    dcomplex*   a_col[ bli_zdotxf_fusefac ];  // cursor into each column of A
    dcomplex*   y_elem[ bli_zdotxf_fusefac ]; // address of each element of y
    dcomplex    acc[ bli_zdotxf_fusefac ];    // one dot-product accumulator per column
    dcomplex*   x_cur;                        // cursor into x

    bool        fall_back = FALSE;

    dim_t       n_peel = 0; // front-edge iterations
    dim_t       n_bulk;     // main-loop iterations
    dim_t       n_tail;     // leftover iterations

    dim_t       off_a, off_a2, off_x;
    dim_t       row, col;

    conj_t      conjat_use;

    // With no columns there is no y to update.
    if ( bli_zero_dim1( b_n ) ) return;

    // With no rows the product term vanishes; only the scaling of y by
    // beta remains.
    if ( bli_zero_dim1( m ) )
    {
        bli_zscalv_ex
        (
          BLIS_NO_CONJUGATE,
          b_n,
          beta,
          y, incy,
          cntx
        );
        return;
    }

    // Decide whether anything prevents the aligned fast path.
    if ( b_n < bli_zdotxf_fusefac ||
         bli_has_nonunit_inc2( inca, incx ) )
    {
        fall_back = TRUE;
    }
    else if ( bli_is_unaligned_to( a,     BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) ||
              bli_is_unaligned_to( x,     BLIS_SIMD_ALIGN_SIZE ) )
    {
        // Misalignment is tolerable only when a, the second column of a,
        // and x all share the same offset; a front-edge loop then handles
        // the leading elements.
        off_a  = bli_offset_from_alignment( a,     BLIS_SIMD_ALIGN_SIZE );
        off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE );
        off_x  = bli_offset_from_alignment( x,     BLIS_SIMD_ALIGN_SIZE );

        if ( off_a == off_a2 && off_a == off_x )
        {
            n_peel = off_x / type_size;
        }
        else
        {
            fall_back = TRUE;
        }
    }

    // Hand the whole problem to the reference implementation if required.
    if ( fall_back == TRUE )
    {
        zdotxf_ft ref_kernel = bli_zdotxf_template_ref;

        ref_kernel
        (
          conjat,
          conjx,
          m,
          b_n,
          alpha,
          a, inca, lda,
          x, incx,
          beta,
          y, incy,
          cntx
        );
        return;
    }

    // Split the rows remaining after the front edge into full iterations
    // and a tail.
    n_bulk = ( m - n_peel ) / n_elem_per_iter;
    n_tail = ( m - n_peel ) % n_elem_per_iter;

    // Set up per-column state: a cursor at the top of each column of A,
    // the address of the matching element of y, and a zeroed accumulator.
    for ( col = 0; col < b_n; ++col )
    {
        a_col[ col ]  = a + (col  )*lda;
        y_elem[ col ] = y + (col  )*incy;

        bli_zset0s( acc[ col ] );
    }
    x_cur = x;

    // A requested conjugation of x is realized indirectly: flip the
    // effective conjugation of A here, then conjugate the accumulated
    // products afterwards.
    conjat_use = conjat;

    if ( bli_is_conj( conjx ) )
        bli_toggle_conj( &conjat_use );

    // Walk down the rows of A (and elements of x) to form
    //   acc = conjat_use( A^T ) * x;
    if ( bli_is_noconj( conjat_use ) )
    {
        // Front edge: elements preceding the aligned region of A and x.
        for ( row = 0; row < n_peel; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdots( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += 1;
            }
            x_cur += 1;
        }

        // Main loop. An optimized kernel would keep acc in registers and
        // fully unroll the loop over b_n; the addresses in a_col[] and
        // x_cur are meant to be BLIS_SIMD_ALIGN_SIZE-aligned here.
        for ( row = 0; row < n_bulk; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdots( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += n_elem_per_iter;
            }
            x_cur += n_elem_per_iter;
        }

        // Tail: whatever did not fill a complete iteration.
        for ( row = 0; row < n_tail; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdots( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += 1;
            }
            x_cur += 1;
        }
    }
    else // if ( bli_is_conj( conjat_use ) )
    {
        // Front edge: elements preceding the aligned region of A and x.
        for ( row = 0; row < n_peel; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdotjs( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += 1;
            }
            x_cur += 1;
        }

        // Main loop. An optimized kernel would keep acc in registers and
        // fully unroll the loop over b_n; the addresses in a_col[] and
        // x_cur are meant to be BLIS_SIMD_ALIGN_SIZE-aligned here.
        for ( row = 0; row < n_bulk; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdotjs( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += n_elem_per_iter;
            }
            x_cur += n_elem_per_iter;
        }

        // Tail: whatever did not fill a complete iteration.
        for ( row = 0; row < n_tail; ++row )
        {
            for ( col = 0; col < b_n; ++col )
            {
                bli_zzzdotjs( *a_col[ col ], *x_cur, acc[ col ] );

                a_col[ col ] += 1;
            }
            x_cur += 1;
        }
    }

    // Complete the indirect conjugation of x by conjugating the
    // accumulated products.
    if ( bli_is_conj( conjx ) )
    {
        for ( col = 0; col < b_n; ++col )
        {
            bli_zconjs( acc[ col ] );
        }
    }

    // y := beta * y + alpha * acc, one element per column of A.
    for ( col = 0; col < b_n; ++col )
    {
        bli_zzscals( *beta, *y_elem[ col ] );
        bli_zzzaxpys( *alpha, acc[ col ], *y_elem[ col ] );
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/3/ 0000775 0000000 0000000 00000000000 14634250137 0022514 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/template/kernels/3/bli_gemm_template_noopt_mxn.c 0000664 0000000 0000000 00000011066 14634250137 0030433 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zgemm_template_noopt
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a1,
       dcomplex*  restrict b1,
       dcomplex*  restrict beta,
       dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template gemm micro-kernel implementation

  A plain-C, double-precision complex micro-kernel intended as a starting
  point for an architecture-specific implementation. Only the dcomplex
  variant is shown; the other floating-point types would be nearly
  identical.

  Operation:

    C11 := beta * C11 + alpha * A1 * B1

  where A1 is MR x k, B1 is k x NR, C11 is MR x NR, and alpha and beta are
  scalars. The full MR x NR product is accumulated in a local buffer, and
  the leading m x n part of it is then written to (or combined with) c11.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    const num_t dt     = BLIS_DCOMPLEX;

    const dim_t mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
    const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    /* Step between consecutive columns of a1 and consecutive rows of b1. */
    const inc_t cs_a   = packmr;
    const inc_t rs_b   = packnr;

    /* The accumulator is stored by columns: unit row stride, column
       stride mr. */
    const inc_t rs_ab  = 1;
    const inc_t cs_ab  = mr;

    const dim_t n_acc  = mr * nr;

    dcomplex    ab[ mr * nr ];

    dim_t       p, row, col, idx;

    /* Clear every accumulator element. */
    for ( idx = 0; idx < n_acc; ++idx )
    {
        bli_zset0s( ab[ idx ] );
    }

    /* Accumulate k rank-1 updates, one per column of a1 / row of b1. */
    for ( p = 0; p < k; ++p )
    {
        dcomplex* a_vec = a1 + p * cs_a;
        dcomplex* b_vec = b1 + p * rs_b;

        /* An optimized kernel would typically fully unroll these two
           loops over NR and MR. */
        for ( col = 0; col < nr; ++col )
        {
            dcomplex b_val = b_vec[ col ];

            for ( row = 0; row < mr; ++row )
            {
                dcomplex a_val = a_vec[ row ];

                bli_zdots( a_val, b_val, ab[ row * rs_ab + col * cs_ab ] );
            }
        }
    }

    /* ab := alpha * ab */
    for ( idx = 0; idx < n_acc; ++idx )
    {
        bli_zscals( *alpha, ab[ idx ] );
    }

    /* A zero beta means c11 is overwritten rather than scaled, so its
       prior contents are never read; otherwise c11 is scaled by beta
       and the result in ab is added to it. */
    if ( bli_zeq0( *beta ) )
    {
        /* c11 := ab */
        bli_zcopys_mxn( m,
                        n,
                        ab,  rs_ab, cs_ab,
                        c11, rs_c,  cs_c );
    }
    else
    {
        /* c11 := beta * c11 + ab */
        bli_zxpbys_mxn( m,
                        n,
                        ab,  rs_ab, cs_ab,
                        beta,
                        c11, rs_c,  cs_c );
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c 0000664 0000000 0000000 00000006576 14634250137 0031666 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zgemmtrsm_l_template_noopt
     (
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a10,
       dcomplex*  restrict a11,
       dcomplex*  restrict b01,
       dcomplex*  restrict b11,
       dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template gemmtrsm_l micro-kernel implementation

  This function contains a template implementation for a double-precision
  complex micro-kernel that fuses a gemm with a trsm_l subproblem.

  This micro-kernel performs the following compound operation:

    B11 := alpha * B11 - A10 * B01    (gemm)
    B11 := inv(A11) * B11             (trsm)
    C11 := B11

  where A11 is MR x MR and lower triangular, A10 is MR x k, B01 is k x NR,
  B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix
  inverse.

  Parameters:

  - k:      The inner dimension of the gemm subproblem.
  - alpha:  The address of the scalar applied to B11 in the gemm step.
  - a10:    The address of A10.
  - a11:    The address of A11.
  - b01:    The address of B01.
  - b11:    The address of B11; updated in place with row stride equal to
            the maximum NR blocksize and unit column stride.
  - c11:    The address of C11, with row/column strides rs_c/cs_c.
  - data:   Auxiliary info, forwarded to both sub-kernels.
  - cntx:   The context used to query MR/NR and forwarded to the gemm
            micro-kernel.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    const num_t        dt        = BLIS_DCOMPLEX;

    /* The gemm micro-kernel takes its m and n arguments as dim_t. */
    const dim_t        mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t        nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t        packnr    = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    const inc_t        rs_b      = packnr;
    const inc_t        cs_b      = 1;

    dcomplex* restrict minus_one = bli_zm1;

    /* b11 = alpha * b11 - a10 * b01;
       The gemm micro-kernel's alpha slot receives -1 and its beta slot
       receives our alpha. The context must be passed as the final
       argument: the gemm micro-kernel queries its blocksizes from it. */
    bli_zgemm_template_noopt
    (
      mr,
      nr,
      k,
      minus_one,
      a10,
      b01,
      alpha,
      b11, rs_b, cs_b,
      data,
      cntx
    );

    /* b11 = inv(a11) * b11;
       c11 = b11; */
    /* NOTE(review): the trsm_l micro-kernel's definition is not visible
       from here; if it also takes a trailing cntx_t* parameter, cntx
       needs to be forwarded in this call as well -- confirm. */
    bli_ztrsm_l_template_noopt
    (
      a11,
      b11,
      c11, rs_c, cs_c,
      data
    );
}
cython-blis-1.0.0/blis/_src/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c 0000664 0000000 0000000 00000006575 14634250137 0031676 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_zgemmtrsm_u_template_noopt
     (
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a10,
       dcomplex*  restrict a11,
       dcomplex*  restrict b01,
       dcomplex*  restrict b11,
       dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template gemmtrsm_u micro-kernel implementation

  This function contains a template implementation for a double-precision
  complex micro-kernel that fuses a gemm with a trsm_u subproblem.

  This micro-kernel performs the following compound operation:

    B11 := alpha * B11 - A12 * B21    (gemm)
    B11 := inv(A11) * B11             (trsm)
    C11 := B11

  where A11 is MR x MR and upper triangular, A12 is MR x k, B21 is k x NR,
  B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix
  inverse.

  Parameters:

  - k:      The inner dimension of the gemm subproblem.
  - alpha:  The address of the scalar applied to B11 in the gemm step.
  - a10:    The address of A12. (The parameter keeps the name used by the
            lower-triangular variant; in this kernel it is the operand
            called A12 above.)
  - a11:    The address of A11.
  - b01:    The address of B21. (As with a10, the name follows the
            lower-triangular variant; here it is the operand called B21.)
  - b11:    The address of B11; updated in place with row stride equal to
            the maximum NR blocksize and unit column stride.
  - c11:    The address of C11, with row/column strides rs_c/cs_c.
  - data:   Auxiliary info, forwarded to both sub-kernels.
  - cntx:   The context used to query MR/NR and forwarded to the gemm
            micro-kernel.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    const num_t        dt        = BLIS_DCOMPLEX;

    /* The gemm micro-kernel takes its m and n arguments as dim_t. */
    const dim_t        mr        = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t        nr        = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t        packnr    = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    const inc_t        rs_b      = packnr;
    const inc_t        cs_b      = 1;

    dcomplex* restrict minus_one = bli_zm1;

    /* b11 = alpha * b11 - a12 * b21;
       (a10 and b01 hold A12 and B21 here.) The gemm micro-kernel's alpha
       slot receives -1 and its beta slot receives our alpha. The context
       must be passed as the final argument: the gemm micro-kernel queries
       its blocksizes from it. */
    bli_zgemm_template_noopt
    (
      mr,
      nr,
      k,
      minus_one,
      a10,
      b01,
      alpha,
      b11, rs_b, cs_b,
      data,
      cntx
    );

    /* b11 = inv(a11) * b11;
       c11 = b11; */
    /* NOTE(review): the trsm_u micro-kernel's definition is not visible
       from here; if it also takes a trailing cntx_t* parameter, cntx
       needs to be forwarded in this call as well -- confirm. */
    bli_ztrsm_u_template_noopt
    (
      a11,
      b11,
      c11, rs_c, cs_c,
      data
    );
}
cython-blis-1.0.0/blis/_src/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c 0000664 0000000 0000000 00000011132 14634250137 0031000 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_ztrsm_l_template_noopt
     (
       dcomplex*  restrict a11,
       dcomplex*  restrict b11,
       dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template trsm_l micro-kernel implementation

  This function contains a template implementation for a double-precision
  complex trsm micro-kernel, coded in C, which can serve as the starting point
  for one to write an optimized micro-kernel on an arbitrary architecture.
  (We show a template implementation for only double-precision complex because
  the templates for the other three floating-point types would be nearly
  identical.)

  This micro-kernel performs the following operation:

    C11 := inv(A11) * B11

  where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is
  MR x NR.

  Parameters:
    a11        - the triangular block; its diagonal holds the INVERSE of
                 each diagonal element (see the note in the loop body).
    b11        - right-hand side block; overwritten with the solution.
    c11        - output block that receives a copy of the solution,
                 accessed with strides rs_c and cs_c.
    data       - auxiliary info; not referenced by this template.
    cntx       - context queried for the MR/NR blocksizes.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    /* Datatype used for the blocksize queries below. The queries
       reference dt, so it has to be declared in this scope. */
    const num_t        dt     = BLIS_DCOMPLEX;

    const dim_t        mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t        nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t        packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
    const inc_t        packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    const dim_t        m      = mr;
    const dim_t        n      = nr;

    /* A11 is walked with unit row stride and column stride packmr;
       B11 is walked with row stride packnr and unit column stride. */
    const inc_t        rs_a   = 1;
    const inc_t        cs_a   = packmr;

    const inc_t        rs_b   = packnr;
    const inc_t        cs_b   = 1;

    dim_t              iter, i, j, l;
    dim_t              n_behind;

    dcomplex* restrict alpha11;
    dcomplex* restrict a10t;
    dcomplex* restrict alpha10;
    dcomplex* restrict X0;
    dcomplex* restrict x1;
    dcomplex* restrict x01;
    dcomplex* restrict chi01;
    dcomplex* restrict chi11;
    dcomplex* restrict gamma11;
    dcomplex           rho11;

    /* Forward substitution: rows are solved from the top (i = 0) down, so
       row i depends on the n_behind = i rows solved before it. */
    for ( iter = 0; iter < m; ++iter )
    {
        i        = iter;
        n_behind = i;
        alpha11  = a11 + (i  )*rs_a + (i  )*cs_a;   /* diagonal element of row i  */
        a10t     = a11 + (i  )*rs_a + (0  )*cs_a;   /* start of row i in A11      */
        X0       = b11 + (0  )*rs_b + (0  )*cs_b;   /* first row of B11           */
        x1       = b11 + (i  )*rs_b + (0  )*cs_b;   /* row i of B11               */

        /* x1 = x1 - a10t * X0; */

        /* x1 = x1 / alpha11; */

        for ( j = 0; j < n; ++j )
        {
            x01     = X0  + (0  )*rs_b + (j  )*cs_b;
            chi11   = x1  + (0  )*rs_b + (j  )*cs_b;
            gamma11 = c11 + (i  )*rs_c + (j  )*cs_c;

            /* chi11 = chi11 - a10t * x01; */
            /* rho11 accumulates the dot product of the first n_behind
               elements of row i of A11 with column j of the rows above. */
            bli_zset0s( rho11 );
            for ( l = 0; l < n_behind; ++l )
            {
                alpha10 = a10t + (l  )*cs_a;
                chi01   = x01  + (l  )*rs_b;

                bli_zaxpys( *alpha10, *chi01, rho11 );
            }
            bli_zsubs( rho11, *chi11 );

            /* chi11 = chi11 / alpha11; */
            /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
               of alpha11, so we can multiply rather than divide. We store
               the inverse of alpha11 intentionally to avoid expensive
               division instructions within the micro-kernel. */
            bli_zscals( *alpha11, *chi11 );

            /* Output final result to matrix C. */
            bli_zcopys( *chi11, *gamma11 );
        }
    }
}
cython-blis-1.0.0/blis/_src/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c 0000664 0000000 0000000 00000011145 14634250137 0031015 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_ztrsm_u_template_noopt
     (
       dcomplex*  restrict a11,
       dcomplex*  restrict b11,
       dcomplex*  restrict c11, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template trsm_u micro-kernel implementation

  This function contains a template implementation for a double-precision
  complex trsm micro-kernel, coded in C, which can serve as the starting point
  for one to write an optimized micro-kernel on an arbitrary architecture.
  (We show a template implementation for only double-precision complex because
  the templates for the other three floating-point types would be nearly
  identical.)

  This micro-kernel performs the following operation:

    C11 := inv(A11) * B11

  where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is
  MR x NR.

  Parameters:
    a11        - the triangular block; its diagonal holds the INVERSE of
                 each diagonal element (see the note in the loop body).
    b11        - right-hand side block; overwritten with the solution.
    c11        - output block that receives a copy of the solution,
                 accessed with strides rs_c and cs_c.
    data       - auxiliary info; not referenced by this template.
    cntx       - context queried for the MR/NR blocksizes.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    /* Datatype used for the blocksize queries below. The queries
       reference dt, so it has to be declared in this scope. */
    const num_t        dt     = BLIS_DCOMPLEX;

    const dim_t        mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t        nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t        packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
    const inc_t        packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    const dim_t        m      = mr;
    const dim_t        n      = nr;

    /* A11 is walked with unit row stride and column stride packmr;
       B11 is walked with row stride packnr and unit column stride. */
    const inc_t        rs_a   = 1;
    const inc_t        cs_a   = packmr;

    const inc_t        rs_b   = packnr;
    const inc_t        cs_b   = 1;

    dim_t              iter, i, j, l;
    dim_t              n_behind;

    dcomplex* restrict alpha11;
    dcomplex* restrict a12t;
    dcomplex* restrict alpha12;
    dcomplex* restrict X2;
    dcomplex* restrict x1;
    dcomplex* restrict x21;
    dcomplex* restrict chi21;
    dcomplex* restrict chi11;
    dcomplex* restrict gamma11;
    dcomplex           rho11;

    /* Backward substitution: rows are solved from the bottom (i = m-1) up,
       so row i depends on the n_behind = iter rows solved before it. */
    for ( iter = 0; iter < m; ++iter )
    {
        i        = m - iter - 1;
        n_behind = iter;
        alpha11  = a11 + (i  )*rs_a + (i  )*cs_a;   /* diagonal element of row i   */
        /* On the first iteration a12t and X2 point one element/row past the
           block, but n_behind is 0 so they are never dereferenced. */
        a12t     = a11 + (i  )*rs_a + (i+1)*cs_a;   /* row i, right of diagonal    */
        x1       = b11 + (i  )*rs_b + (0  )*cs_b;   /* row i of B11                */
        X2       = b11 + (i+1)*rs_b + (0  )*cs_b;   /* rows below row i in B11     */

        /* x1 = x1 - a12t * X2; */

        /* x1 = x1 / alpha11; */

        for ( j = 0; j < n; ++j )
        {
            chi11   = x1  + (0  )*rs_b + (j  )*cs_b;
            x21     = X2  + (0  )*rs_b + (j  )*cs_b;
            gamma11 = c11 + (i  )*rs_c + (j  )*cs_c;

            /* chi11 = chi11 - a12t * x21; */
            /* rho11 accumulates the dot product of the n_behind elements
               right of the diagonal in row i of A11 with column j of the
               rows below. */
            bli_zset0s( rho11 );
            for ( l = 0; l < n_behind; ++l )
            {
                alpha12 = a12t + (l  )*cs_a;
                chi21   = x21  + (l  )*rs_b;

                bli_zaxpys( *alpha12, *chi21, rho11 );
            }
            bli_zsubs( rho11, *chi11 );

            /* chi11 = chi11 / alpha11; */
            /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
               of alpha11, so we can multiply rather than divide. We store
               the inverse of alpha11 intentionally to avoid expensive
               division instructions within the micro-kernel. */
            bli_zscals( *alpha11, *chi11 );

            /* Output final result to matrix C. */
            bli_zcopys( *chi11, *gamma11 );
        }
    }
}
cython-blis-1.0.0/blis/_src/config/template/make_defs.mk 0000664 0000000 0000000 00000005124 14634250137 0023160 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration this file describes. common.mk keeps a running
# list of the configurations it includes; the append to that list is
# currently disabled.
THIS_CONFIG    := template
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#

# common.mk appends its own general-purpose, configuration-agnostic flags
# to the variables below, so only configuration-specific additions need
# to be listed here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every debug type except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization level: -O2 by default, replaced by -O0 when the
# debug type is "noopt".
COPTFLAGS      := -O2
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
endif

# Optimized kernels: general optimization flags followed by -O3, with no
# vectorization flags.
CKOPTFLAGS     := $(COPTFLAGS) -O3
CKVECFLAGS     :=

# Reference kernels reuse the optimized-kernel settings unchanged.
CROPTFLAGS     := $(CKOPTFLAGS)
CRVECFLAGS     := $(CKVECFLAGS)

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/thunderx2/ 0000775 0000000 0000000 00000000000 14634250137 0021017 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/thunderx2/bli_cntx_init_thunderx2.c 0000664 0000000 0000000 00000006055 14634250137 0026021 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
  Populate cntx for the thunderx2 configuration: start from the thunderx2
  reference context, then register the optimized single- and
  double-precision real gemm micro-kernels and the blocksizes that go
  with them.
*/
void bli_cntx_init_thunderx2( cntx_t* cntx )
{
    // Local table of blocksize objects, indexed by blocksize id.
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];

    // Begin with the default (reference) kernels and blocksizes.
    bli_cntx_init_thunderx2_ref( cntx );

    // -------------------------------------------------------------------------

    // Register the two optimized native gemm micro-kernels (float and
    // double). The last value of each entry is the kernel's
    // storage-preference flag.
    bli_cntx_set_l3_nat_ukrs
    (
      2,
      BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_armv8a_asm_8x12, FALSE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8,  FALSE,
      cntx
    );

    // Fill in the architecture-specific level-3 blocksizes, listed here in
    // the same order in which they are registered below. The four values
    // are per datatype in the order s, d, c, z.
    // NOTE(review): -1 presumably means "no value supplied" for the
    // complex types -- confirm against bli_blksz_init_easy.
    //                                            s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  3072,  3072,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],   640,   240,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   120,   120,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     8,    -1,    -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     8,     6,    -1,    -1 );

    // Install the five register and cache blocksizes for native execution.
    // Each entry gives the blocksize id, its values, and the id of the
    // blocksize it must be a multiple of.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 5,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/thunderx2/bli_family_thunderx2.h 0000664 0000000 0000000 00000003470 14634250137 0025306 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION --------------------------------------------------------
// SIMD alignment setting for the thunderx2 configuration family.
// NOTE(review): presumably a byte count consumed by the framework's memory
// allocator -- confirm against the places where BLIS_SIMD_ALIGN_SIZE is used.
#define BLIS_SIMD_ALIGN_SIZE 16
cython-blis-1.0.0/blis/_src/config/thunderx2/make_defs.mk 0000664 0000000 0000000 00000006043 14634250137 0023271 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration this file describes. common.mk keeps a running
# list of the configurations it includes; the append to that list is
# currently disabled.
THIS_CONFIG    := thunderx2
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#

# common.mk appends its own general-purpose, configuration-agnostic flags
# to the variables below, so only configuration-specific additions need
# to be listed here.
CPPROCFLAGS    := -D_GNU_SOURCE
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every debug type except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization flags: -O2 tuned for thunderx2t99 by default,
# replaced by plain -O0 when the debug type is "noopt".
COPTFLAGS      := -O2 -mcpu=thunderx2t99
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
endif

# Optimized kernels: general optimization flags plus -O3 and tree
# vectorization. Only gcc and clang are accepted; any other compiler
# vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3 -ftree-vectorize
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -mcpu=thunderx2t99
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -mcpu=thunderx2t99
else
$(error gcc or clang is required for this configuration.)
endif

# Reference kernels: same optimization flags as the optimized kernels.
# For gcc and clang the vectorization flags additionally relax
# floating-point semantics.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/x86_64/ 0000775 0000000 0000000 00000000000 14634250137 0020032 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/x86_64/bli_family_x86_64.h 0000664 0000000 0000000 00000003306 14634250137 0023332 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/x86_64/make_defs.mk 0000664 0000000 0000000 00000006122 14634250137 0022302 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration this file describes. common.mk keeps a running
# list of the configurations it includes; the append to that list is
# currently disabled.
THIS_CONFIG    := x86_64
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#

# common.mk appends its own general-purpose, configuration-agnostic flags
# to the variables below, so only configuration-specific additions need
# to be listed here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every debug type except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization level: -O2 by default, replaced by -O0 when the
# debug type is "noopt".
COPTFLAGS      := -O2
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
endif

# Optimized kernels: general optimization flags followed by -O3, plus
# vendor-specific vectorization flags. Only gcc, icc and clang are
# accepted; any other compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS     := -xSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference kernels: same optimization flags as the optimized kernels.
# For gcc and clang the vectorization flags additionally relax
# floating-point semantics; other vendors reuse the kernel flags as-is.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/x86_64_no_skx/ 0000775 0000000 0000000 00000000000 14634250137 0021413 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/x86_64_no_skx/bli_family_x86_64_no_skx.h 0000664 0000000 0000000 00000003306 14634250137 0026274 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/x86_64_no_skx/make_defs.mk 0000664 0000000 0000000 00000006127 14634250137 0023670 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of the configuration this file describes. common.mk keeps a running
# list of the configurations it includes; the append to that list is
# currently disabled.
THIS_CONFIG    := x86_64_no_skx
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- C compiler and related flags ---
#

# common.mk appends its own general-purpose, configuration-agnostic flags
# to the variables below, so only configuration-specific additions need
# to be listed here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Request debug symbols for every debug type except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS      := -g
endif

# General optimization level: -O3 by default, replaced by -O0 when the
# debug type is "noopt".
COPTFLAGS      := -O3
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS      := -O0
endif

# Optimized kernels: the general optimization flags unchanged, plus
# vendor-specific vectorization flags. Only gcc, icc and clang are
# accepted; any other compiler vendor aborts the build.
CKOPTFLAGS     := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS     := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS     := -xSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS     := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif

# Reference kernels: same optimization flags as the optimized kernels.
# For gcc and clang the vectorization flags additionally relax
# floating-point semantics; other vendors reuse the kernel flags as-is.
CROPTFLAGS     := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS     := $(CKVECFLAGS)
endif

# Copy every variable set above into new variables whose names contain
# the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/x86_64_no_zen2/ 0000775 0000000 0000000 00000000000 14634250137 0021464 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/x86_64_no_zen2/bli_family_x86_64_no_zen2.h 0000664 0000000 0000000 00000003306 14634250137 0026416 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/x86_64_no_zen2/make_defs.mk 0000664 0000000 0000000 00000006130 14634250137 0023733 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The value is handed to store-make-defs at
# the bottom of this file.
THIS_CONFIG := x86_64_no_zen2
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: common.mk later appends general-purpose, configuration-agnostic
# flags to these variables; anything set here is configuration-specific.
# These four start out empty for this configuration.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Optimization level: -O0 only for DEBUG_TYPE=noopt, -O3 in every other case.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3
else
COPTFLAGS := -O0
endif
# Debug symbols are emitted unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Optimized kernels: same optimization level as the rest of the library,
# plus vendor-specific vectorization flags. Any compiler other than gcc,
# icc, or clang aborts the build.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif
# Reference kernels: start from the optimized-kernel flags; gcc and clang
# additionally get relaxed floating-point flags, any other vendor does not.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Copy every variable defined above into a new variable whose name
# contains the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/x86_64_no_zen3/ 0000775 0000000 0000000 00000000000 14634250137 0021465 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/x86_64_no_zen3/bli_family_x86_64_no_zen3.h 0000664 0000000 0000000 00000003306 14634250137 0026420 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
//#endif
cython-blis-1.0.0/blis/_src/config/x86_64_no_zen3/make_defs.mk 0000664 0000000 0000000 00000006130 14634250137 0023734 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2014, The University of Texas at Austin
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The value is handed to store-make-defs at
# the bottom of this file.
THIS_CONFIG := x86_64_no_zen3
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: common.mk later appends general-purpose, configuration-agnostic
# flags to these variables; anything set here is configuration-specific.
# These four start out empty for this configuration.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Optimization level: -O0 only for DEBUG_TYPE=noopt, -O3 in every other case.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O3
else
COPTFLAGS := -O0
endif
# Debug symbols are emitted unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Optimized kernels: same optimization level as the rest of the library,
# plus vendor-specific vectorization flags. Any compiler other than gcc,
# icc, or clang aborts the build.
CKOPTFLAGS := $(COPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else ifeq ($(CC_VENDOR),icc)
CKVECFLAGS := -xSSE3
else ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2
else
$(error gcc, icc, or clang is required for this configuration.)
endif
# Reference kernels: start from the optimized-kernel flags; gcc and clang
# additionally get relaxed floating-point flags, any other vendor does not.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
# Copy every variable defined above into a new variable whose name
# contains the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/zen/ 0000775 0000000 0000000 00000000000 14634250137 0017670 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/zen/amd_config.mk 0000664 0000000 0000000 00000005710 14634250137 0022312 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2019, Advanced Micro Devices, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# All the common flags for AMD architectures will be added here.
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Debug symbols are emitted unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Optimization level: -O0 for DEBUG_TYPE=noopt, otherwise -O2 without a
# frame pointer.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
endif
# Flags specific to optimized kernels.
# NOTE: The -fomit-frame-pointer option is needed for some kernels because
# they make explicit use of the rbp register.
# -O3 is appended after $(COPTFLAGS), so the kernel command line carries
# both optimization levels, with -O3 last.
CKOPTFLAGS := $(COPTFLAGS) -O3
ifeq ($(CC_VENDOR),gcc)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
else
ifeq ($(CC_VENDOR),clang)
CKVECFLAGS := -mavx2 -mfpmath=sse -mfma
# Detect AMD's AOCC flavour of clang by looking for "AOCC.LLVM" in the
# first line of the version banner. The banner is printed on stderr, so
# stderr is merged into the pipe with the POSIX form "2>&1 |". The
# bash-only "|&" shorthand must not be used here: $(shell ...) runs
# under make's SHELL (/bin/sh by default), and a strict POSIX shell such
# as dash rejects "|&" as a syntax error, which would silently disable
# this check.
# NOTE(review): this probes the "clang" found on PATH rather than $(CC);
# confirm that is intended when CC names a different clang binary.
ifeq ($(strip $(shell clang -v 2>&1 | head -n 1 | grep -c 'AOCC.LLVM')),1)
CKVECFLAGS += -mllvm -disable-licm-vrp
endif
else
$(error gcc or clang are required for this configuration.)
endif
endif
# Flags specific to reference kernels.
# Reference kernels reuse the optimized-kernel flags; gcc and clang
# additionally get relaxed floating-point flags.
CROPTFLAGS := $(CKOPTFLAGS)
ifeq ($(CC_VENDOR),gcc)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
ifeq ($(CC_VENDOR),clang)
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
else
CRVECFLAGS := $(CKVECFLAGS)
endif
endif
cython-blis-1.0.0/blis/_src/config/zen/bli_cntx_init_zen.c 0000664 0000000 0000000 00000027627 14634250137 0023553 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )
/*
   bli_cntx_init_zen()

   Populate the context `cntx` for the zen sub-configuration.

   The function first installs the reference defaults via
   bli_cntx_init_zen_ref() and then overrides them, in this order, with:
     1. native gemm and gemmtrsm micro-kernels,
     2. packm kernels,
     3. level-1f and level-1v kernels,
     4. native register/cache blocksizes,
     5. sup thresholds, the sup handler, sup kernels and sup blocksizes.

   cntx: the context to initialize; it is modified in place by every
         bli_cntx_set_*() call below.

   NOTE: Each bli_cntx_set_*() call begins with an integer count that
   matches the number of entries that follow it (after preprocessing).
   When adding, removing, or toggling entries with #if, update the count
   as well.
*/
void bli_cntx_init_zen( cntx_t* cntx )
{
// Scratch storage. blkszs[] is filled twice: first with the native
// blocksizes (consumed by bli_cntx_set_blkszs()), then MR/NR/MC/KC/NC are
// overwritten with the sup values (consumed by
// bli_cntx_set_l3_sup_blkszs()). The order of those two phases matters.
blksz_t blkszs[ BLIS_NUM_BLKSZS ];
blksz_t thresh[ BLIS_NUM_THRESH ];
// Set default kernel blocksizes and functions.
bli_cntx_init_zen_ref( cntx );
// -------------------------------------------------------------------------
// Update the context with optimized native gemm micro-kernels and
// their storage preferences.
// NOTE(review): the trailing TRUE on each entry is presumably the storage
// preference mentioned above -- confirm against bli_cntx_set_l3_nat_ukrs().
bli_cntx_set_l3_nat_ukrs
(
// 8 entries: 4 gemm + 2 gemmtrsm_l + 2 gemmtrsm_u.
8,
// gemm
BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
// gemmtrsm_l
BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
// gemmtrsm_u
BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
cntx
);
#if 1
// Update the context with optimized packm kernels.
bli_cntx_set_packm_kers
(
// 8 entries: two panel widths for each of the four datatypes.
8,
BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
cntx
);
#endif
// Update the context with optimized level-1f kernels.
bli_cntx_set_l1f_kers
(
// 4 entries: axpyf and dotxf, each for float and double.
4,
// axpyf
BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8,
BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
// dotxf
BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
cntx
);
// Update the context with optimized level-1v kernels.
bli_cntx_set_l1v_kers
(
// 16 entries are active after preprocessing: float and double variants
// of amaxv, axpyv, copyv, dotv, dotxv, scalv, setv and swapv. The
// "#if 0" alternatives for axpyv and scalv are compiled out and are
// not part of the count.
16,
// amaxv
BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
// axpyv
#if 0
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
#if 1
// copyv
BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
#endif
// dotv
BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int,
BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int,
// dotxv
BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
// scalv
#if 0
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
#if 1
// setv
BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
// swapv
BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
#endif
cntx
);
// Initialize level-3 blocksize objects with architecture-specific values.
// The MR/NR values below agree with the micro-kernel names registered
// above (6x16, 6x8, 3x8, 3x4).
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
/*
Multi-instance performance improvement of DGEMM when bound to a CCX.
In multi-instance mode each thread runs a sequential DGEMM.
a) If BLIS is run in multi-instance mode with
CPU freq 2.6/2.2 GHz
DDR4 clock frequency 2400 MHz
mc = 240, kc = 512, and nc = 2040
has better performance on EPYC server, over the default block sizes.
b) If BLIS is run in single-instance mode
mc = 510, kc = 1024 and nc = 4080
(The mc/kc/nc values quoted here are the double-precision column of the
tables below.)
*/
#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES
// Zen optimized level 3 cache block sizes
// An undefined macro evaluates to 0 in #if, so the multi-instance (#else)
// values are used unless BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES is defined
// to a non-zero value. In bli_family_zen.h its #define sits inside an
// "#if 0" region.
#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 );
#endif
#else
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 );
#endif
// Level-1f blocksizes. Only float and double level-1f kernels were
// registered above.
// NOTE(review): -1 in the c/z columns presumably means "no value supplied
// for this datatype" -- confirm against bli_blksz_init_easy().
bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );
// Update the context with the current architecture's register and cache
// blocksizes (and multiples) for native execution.
// NOTE(review): the third column of each entry is presumably the blocksize
// id whose value the entry must be a multiple of (per "and multiples"
// above) -- confirm against bli_cntx_set_blkszs().
bli_cntx_set_blkszs
(
// 7 entries: 5 level-3 + 2 level-1f.
BLIS_NAT, 7,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
// level-1f
BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
cntx
);
// -------------------------------------------------------------------------
// Initialize sup thresholds with architecture-appropriate values.
// Only the s and d columns carry values; sup kernels are registered below
// for float and double only.
// s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 );
// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
(
// 3 entries: MT, NT, KT.
3,
BLIS_MT, &thresh[ BLIS_MT ],
BLIS_NT, &thresh[ BLIS_NT ],
BLIS_KT, &thresh[ BLIS_KT ],
cntx
);
// Initialize the context with the sup handlers.
bli_cntx_set_l3_sup_handlers
(
// 1 entry: only gemm; the gemmt handler below is commented out.
1,
BLIS_GEMM, bli_gemmsup_ref,
//BLIS_GEMMT, bli_gemmtsup_ref,
cntx
);
// Update the context with optimized small/unpacked gemm kernels.
bli_cntx_set_l3_sup_kers
(
// 16 entries: 8 storage combinations (RRR ... CCC) for double, then the
// same 8 for float. The two "#if 0" groups further down are compiled
// out and are not part of the count.
16,
//BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
// NOTE: This set of kernels is likely broken and therefore disabled.
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
cntx
);
// Initialize level-3 sup blocksize objects with architecture-specific
// values.
// This overwrites the native MR/NR/MC/KC/NC entries of blkszs[] set earlier;
// those were already handed to bli_cntx_set_blkszs() above.
// NOTE(review): bli_blksz_init() takes a second row of values (9, 9 here);
// presumably these are maximum blocksizes -- confirm against its prototype.
// s d c z
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
9, 9, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 );
#if 0
bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3,
9, 9, 3, 3 );
bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 );
#endif
// Update the context with the current architecture's register and cache
// blocksizes for small/unpacked level-3 problems.
bli_cntx_set_l3_sup_blkszs
(
// 5 entries: NC, KC, MC, NR, MR.
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);
}
cython-blis-1.0.0/blis/_src/config/zen/bli_family_zen.h 0000664 0000000 0000000 00000006400 14634250137 0023024 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Family-level compile-time settings for the zen configuration.
//
// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops
// to be not parallelized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
// Tested with #ifdef in bli_cntx_init_zen.c, where it selects the
// Zen-specific MC/KC/NC cache blocksizes over the generic ones.
#define BLIS_ENABLE_ZEN_BLOCK_SIZES
// Vanilla BLIS disables AMD's small matrix handling by default.
// NOTE: everything up to the matching #endif is compiled out. That
// includes BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES, so unless it is defined
// elsewhere, the "#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES" test in
// bli_cntx_init_zen.c sees an undefined macro (which evaluates to 0) and
// the multi-instance blocksizes are used.
#if 0
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128
//This macro will enable BLIS DGEMM to choose block sizes for a single instance mode
#define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
#endif
// The sup MR/NR extension macros below are likewise compiled out in this
// configuration.
#if 0
// Allow the sup implementation to combine some small edge case iterations in
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
// block-panel algorithm (NR) with the last full iteration that precedes it.
// NOTE: These cpp macros need to be explicitly set to an integer since they
// are used at compile-time to create unconditional branches or dead code
// regions.
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
#endif
cython-blis-1.0.0/blis/_src/config/zen/make_defs.mk 0000664 0000000 0000000 00000006541 14634250137 0022145 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. The value is handed to store-make-defs at
# the bottom of this file.
THIS_CONFIG := zen
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: common.mk later appends general-purpose, configuration-agnostic
# flags to these variables; anything set here is configuration-specific.
# These four start out empty for this configuration.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Optimization level: -O0 only for DEBUG_TYPE=noopt; otherwise -O2 without
# a frame pointer.
ifneq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O2 -fomit-frame-pointer
else
COPTFLAGS := -O0
endif
# Debug symbols are emitted unless DEBUG_TYPE is "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Flags shared by optimized and reference kernels.
# NOTE: -fomit-frame-pointer is required by some kernels because they use
# the rbp register explicitly.
CKOPTFLAGS := $(COPTFLAGS) -O3
CROPTFLAGS := $(CKOPTFLAGS)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
# Pick the -march (and related) flags according to the compiler vendor
# and, for gcc, its version. Any vendor other than gcc, clang, or aocc
# aborts the build.
ifeq ($(CC_VENDOR),gcc)
ifneq ($(GCC_OT_6_1_0),yes) # gcc 6.1 or newer.
CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
else
# gcc versions older than 6.1.
CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
endif
else ifeq ($(CC_VENDOR),clang)
CVECFLAGS_VER := -march=znver1
else ifeq ($(CC_VENDOR),aocc)
CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp
else
$(error gcc, clang, or aocc is required for this configuration.)
endif
# Both kernel flavours receive the architecture flags chosen above.
CKVECFLAGS += $(CVECFLAGS_VER)
CRVECFLAGS += $(CVECFLAGS_VER)
# Copy every variable defined above into a new variable whose name
# contains the configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/zen/old/ 0000775 0000000 0000000 00000000000 14634250137 0020446 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/zen/old/bli_kernel.h 0000664 0000000 0000000 00000016566 14634250137 0022743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_KERNEL_H
#define BLIS_KERNEL_H
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------
//
// Constraints:
//
// (1) MC must be a multiple of:
// (a) MR (for zero-padding purposes)
// (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
// (a) NR (for zero-padding purposes)
// (b) MR (for zero-padding purposes when MR and NR are "swapped")
//
// threading related
// By default it is effective to parallelize the
// outer loops. Setting these macros to 1 will force
// JR and NR inner loops to be not parallelized.
#define BLIS_DEFAULT_MR_THREAD_MAX 1
#define BLIS_DEFAULT_NR_THREAD_MAX 1
// sgemm micro-kernel
// Three candidate single-precision configurations follow. Each one names a
// micro-kernel symbol and its MC/KC/NC/MR/NR blocksizes; MR x NR matches the
// dimensions in the kernel name. Only the candidate guarded by "#if 1" is
// compiled in; flip the guards to select a different one (never enable more
// than one, or the macros will be redefined with conflicting values).
// Candidate (disabled): 24x4 kernel.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4
#define BLIS_DEFAULT_MC_S 264
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 24
#define BLIS_DEFAULT_NR_S 4
#endif
// Candidate (disabled): 16x6 kernel.
#if 0
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 6
#endif
// Candidate (enabled): 6x16 kernel. This is the only sgemm candidate that
// also defines the PREFERS_CONTIG_ROWS flag.
#if 1
#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S 144
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 4080
#define BLIS_DEFAULT_MR_S 6
#define BLIS_DEFAULT_NR_S 16
#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// dgemm micro-kernel
// Three candidate double-precision configurations follow, laid out like the
// sgemm candidates: a micro-kernel symbol plus MC/KC/NC/MR/NR blocksizes.
// Only the candidate guarded by "#if 1" is compiled in.
// Candidate (disabled): 12x4 kernel.
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4
#define BLIS_DEFAULT_MC_D 96
#define BLIS_DEFAULT_KC_D 192
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 12
#define BLIS_DEFAULT_NR_D 4
#endif
// Candidate (disabled): 8x6 kernel.
#if 0
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D 72
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
#endif
// Candidate (enabled): 6x8 kernel. MC and KC were raised from the values
// shown in the trailing comments (72 and 256).
// NOTE(review): MC = 510 is a multiple of MR (6) but not of NR (8), which
// does not satisfy constraint (1)(b) listed at the top of this header; the
// previous value 72 satisfied both. Confirm this is intentional.
#if 1
#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D 510 // 72 /* Improves performance for large Matrices */
#define BLIS_DEFAULT_KC_D 1024 // 256
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 6
#define BLIS_DEFAULT_NR_D 8
#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// cgemm micro-kernel
// Single-precision complex configuration: one candidate only, a 3x8 kernel
// (MR = 3, NR = 8). MC (144) and NC (4080) are multiples of both MR and NR,
// as the constraints at the top of this header require.
#if 1
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 3
#define BLIS_DEFAULT_NR_C 8
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// zgemm micro-kernel
// Double-precision complex configuration: one candidate only, a 3x4 kernel
// (MR = 3, NR = 4). MC (72) and NC (4080) are multiples of both MR and NR,
// as the constraints at the top of this header require.
#if 1
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 3
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif
// -- trsm-related --
// Lower-triangular trsm micro-kernels for single and double precision. The
// 6x16 / 6x8 suffixes match the MR x NR of the enabled sgemm/dgemm kernels.
#define BLIS_STRSM_L_UKERNEL bli_strsm_l_int_6x16
#define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_int_6x8
// --gemmtrsm-related --
// Lower-triangular fused gemmtrsm micro-kernels, again 6x16 (s) and 6x8 (d).
#define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_6x16
#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_6x8
// Enables the small-matrix code path declared below.
#define BLIS_SMALL_MATRIX_ENABLE
//This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
// Additional thresholds for rectangular problems, keyed on the m and k
// dimensions. NOTE(review): how these combine with BLIS_SMALL_MATRIX_THRES is
// decided by the implementation, which is not visible here.
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
// Prototype of the small-matrix gemm entry point (declaration only; the
// definition lives elsewhere). It takes the alpha/beta scalars and the a, b,
// c operands as objects, plus a context and a control tree, and returns a
// gint_t. NOTE(review): the meaning of the return value is not documented
// here -- presumably a status code; confirm against the definition.
gint_t bli_gemm_small_matrix
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl
);
// The remainder of this header selects level-1f and level-1v kernels for
// single (S) and double (D) precision. Section headers with no #define under
// them are placeholders: this configuration overrides nothing for those
// operations. Some headers appear more than once; that is harmless.
// -- LEVEL-2 KERNEL CONSTANTS -------------------------------------------------
// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// NOTE(review): by name these two are level-1f constants, although they sit
// under the packm/unpackm headers above -- confirm the intended section.
#define BLIS_DEFAULT_1F_S 8
#define BLIS_DEFAULT_1F_D 4
// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------
// -- axpy2v --
// -- dotaxpyv --
// -- axpyf --
#define BLIS_SAXPYF_KERNEL bli_saxpyf_int_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1
// -- dotxf --
#define BLIS_SDOTXF_KERNEL bli_sdotxf_int_var1
#define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1
// -- dotxaxpyf --
// -- LEVEL-1M KERNEL DEFINITIONS ----------------------------------------------
// -- packm --
// -- unpackm --
// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------
// -- amax --
#define BLIS_SAMAXV_KERNEL bli_samaxv_opt_var1
#define BLIS_DAMAXV_KERNEL bli_damaxv_opt_var1
// -- addv --
// -- axpyv --
#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var10
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var10
// -- copyv --
// -- dotv --
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1
#define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1
// -- dotxv --
// Unlike the other level-1v selections, dotxv uses "unb" rather than "opt"
// kernel names.
#define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1
#define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1
// -- invertv --
// -- scal2v --
// -- scalv --
#define BLIS_SSCALV_KERNEL bli_sscalv_opt_var2
#define BLIS_DSCALV_KERNEL bli_dscalv_opt_var2
// -- setv --
// -- subv --
// -- swapv --
// Closes the BLIS_KERNEL_H include guard.
#endif
cython-blis-1.0.0/blis/_src/config/zen2/ 0000775 0000000 0000000 00000000000 14634250137 0017752 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/zen2/bli_cntx_init_zen2.c 0000664 0000000 0000000 00000025123 14634250137 0023704 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Populate a context for the zen2 configuration.
//
// The context is first filled with the zen2 reference defaults and then
// updated, in order, with: native level-3 micro-kernels, packm kernels,
// level-1f and level-1v kernels, native blocksizes, sup thresholds, sup
// kernels and sup blocksizes.
//
//   cntx - the context to initialize; modified in place.
void bli_cntx_init_zen2( cntx_t* cntx )
{
    // Scratch tables. The blocksize table is filled once for native
    // execution and later refilled with the sup values, so the order of the
    // statements below matters.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];
    blksz_t sup_thr[ BLIS_NUM_THRESH ];

    // Begin from the reference kernels and blocksizes.
    bli_cntx_init_zen2_ref( cntx );

    // ---------------------------------------------------------------------
    // Native level-3 micro-kernels together with their storage preferences.
    // The leading integer is the number of entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      8,

      // gemm
      BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,

      // gemmtrsm_l
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,

      // gemmtrsm_u
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,

      cntx
    );

    // Optimized packm kernels: one pair per datatype, sized to that
    // datatype's MR and NR.
    bli_cntx_set_packm_kers
    (
      8,
      BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
      BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
      BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
      BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
      BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
      cntx
    );

    // Optimized level-1f kernels.
    bli_cntx_set_l1f_kers
    (
      4,

      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_5,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,

      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,

      cntx
    );

    // Optimized level-1v kernels.
    bli_cntx_set_l1v_kers
    (
      16,

      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,

      // axpyv
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,

      // dotv
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int10,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int10,

      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,

      // scalv
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,

      // swapv
      BLIS_SWAPV_KER, BLIS_FLOAT,  bli_sswapv_zen_int8,
      BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,

      // copyv
      BLIS_COPYV_KER, BLIS_FLOAT,  bli_scopyv_zen_int,
      BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,

      // setv
      BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
      BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,

      cntx
    );

    // Native level-3 blocksizes; the four columns are the s, d, c and z
    // datatypes.
    //                                        s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_MR ],     6,    6,    3,    3 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    16,    8,    8,    4 );
    // The cache blocksizes depend on AOCL_BLIS_MULTIINSTANCE. If that macro
    // is not defined, the preprocessor treats it as 0 and the #else values
    // are used.
#if AOCL_BLIS_MULTIINSTANCE
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   144,  240,  144,   72 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   256,  512,  256,  256 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  4080, 2040, 4080, 4080 );
#else
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   144,   72,   72,   36 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   256,  256,  256,  256 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  4080, 4080, 4080, 4080 );
#endif
    // Level-1f fusing factors; -1 leaves the c and z columns unset here.
    bli_blksz_init_easy( &bsz[ BLIS_AF ],     5,    5,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_DF ],     8,    8,   -1,   -1 );

    // Register the native blocksizes; the third field of every entry names
    // the blocksize it must be a multiple of.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,

      // level-3
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,

      // level-1f
      BLIS_AF, &bsz[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &bsz[ BLIS_DF ], BLIS_DF,

      cntx
    );

    // ---------------------------------------------------------------------
    // Sup thresholds for the m, n and k dimensions.
    //                                          s       d      c   z
#if 0
    // Disabled alternative kept for reference.
    bli_blksz_init_easy( &sup_thr[ BLIS_MT ], 100000, 100000, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_NT ], 100000, 100000, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_KT ], 100000, 100000, -1, -1 );
#else
    bli_blksz_init_easy( &sup_thr[ BLIS_MT ],    500,    249, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_NT ],    500,    249, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_KT ],    500,    249, -1, -1 );
#endif

    // Register the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &sup_thr[ BLIS_MT ],
      BLIS_NT, &sup_thr[ BLIS_NT ],
      BLIS_KT, &sup_thr[ BLIS_KT ],
      cntx
    );

#if 0
    // Disabled: register the reference sup handler for gemm.
    bli_cntx_set_l3_sup_handlers
    (
      1,
      BLIS_GEMM, bli_gemmsup_ref,
      cntx
    );
#endif

    // Small/unpacked (sup) gemm kernels: one entry per storage combination,
    // for double and float only. The count covers the 16 active entries; the
    // "#if 0" groups inside the argument list are compiled out.
    bli_cntx_set_l3_sup_kers
    (
      16,

      //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,

      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif
#if 0
      // NOTE: This set of kernels is likely broken and therefore disabled.
      BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,

      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
      cntx
    );

    // Sup blocksizes. These overwrite the native values still held in the
    // scratch table, which is why this must come after bli_cntx_set_blkszs().
    // MR is given two rows of values via bli_blksz_init().
    //                                        s     d    c   z
    bli_blksz_init     ( &bsz[ BLIS_MR ],     6,    6,  -1, -1,
                                              9,    9,  -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    16,    8,  -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   168,   72,  -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   256,  256,  -1, -1 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  4080, 4080,  -1, -1 );

    // Register the sup blocksizes for small/unpacked level-3 problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &bsz[ BLIS_NC ],
      BLIS_KC, &bsz[ BLIS_KC ],
      BLIS_MC, &bsz[ BLIS_MC ],
      BLIS_NR, &bsz[ BLIS_NR ],
      BLIS_MR, &bsz[ BLIS_MR ],
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/zen2/bli_family_zen2.h 0000664 0000000 0000000 00000007363 14634250137 0023201 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops
// to be not parallelized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1
// Vanilla BLIS disables AMD's small matrix handling by default.
// NOTE(review): as this file stands, the "#if 0" below is closed only by the
// final "#endif", so every macro from here to the end of the file -- the
// *_ROME thresholds and AOCL_BLIS_MULTIINSTANCE included -- is compiled out.
// An "#if AOCL_BLIS_MULTIINSTANCE" elsewhere therefore sees an undefined
// macro, which the preprocessor evaluates as 0 (the same value it would have
// had if defined here). Confirm that disabling the whole region is intended.
#if 0
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM
// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128
#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128
// Rome-specific small-matrix switch and thresholds. The D_ prefixed macros
// are grouped by trsm variant (ALXB, XAUB, XALTB, XALB, XAUTB), each with
// optional row-panel / column-panel / square sub-thresholds on m or n.
#define BLIS_ENABLE_SMALL_MATRIX_ROME
#define BLIS_SMALL_MATRIX_THRES_ROME 400
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50
// When running HPL with pure MPI without DGEMM threading (Single-threaded
// BLIS), defining this macro as 1 yields better performance.
#define AOCL_BLIS_MULTIINSTANCE 0
#endif
cython-blis-1.0.0/blis/_src/config/zen2/make_defs.mk 0000664 0000000 0000000 00000007556 14634250137 0022236 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Name of this configuration. Adding it to CONFIGS_INCL (the running list of
# configurations included by common.mk) is left commented out below.
THIS_CONFIG    := zen2
#CONFIGS_INCL   += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: common.mk appends its own general-purpose, configuration-agnostic
# flags to these variables; only configuration-specific additions go here.
CPPROCFLAGS    :=
CMISCFLAGS     :=
CPICFLAGS      :=
CWARNFLAGS     :=

# Emit debug symbols unless debugging is switched off.
ifneq ($(DEBUG_TYPE),off)
  CDBGFLAGS    := -g
endif

# General optimization level: none for "noopt" builds, -O2 otherwise.
ifeq ($(DEBUG_TYPE),noopt)
  COPTFLAGS    := -O0
else
  COPTFLAGS    := -O2 -fomit-frame-pointer
endif

# Flags specific to optimized (CK*) and reference (CR*) kernels.
# NOTE: -fomit-frame-pointer is needed for some kernels because they make
# explicit use of the rbp register.
CKOPTFLAGS     := $(COPTFLAGS) -O3
CROPTFLAGS     := $(CKOPTFLAGS)
CKVECFLAGS     := -mavx2 -mfma -mfpmath=sse
CRVECFLAGS     := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast

# Choose the -march flags from the compiler vendor and version. Older
# compilers fall back to an earlier target; any other vendor is an error.
ifeq ($(CC_VENDOR),gcc)
  ifeq ($(GCC_OT_6_1_0),yes)
    # gcc versions older than 6.1.
    CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
  else ifeq ($(GCC_OT_9_1_0),yes)
    # gcc versions 6.1 or newer, but older than 9.1.
    CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
  else
    # gcc versions 9.1 or newer.
    CVECFLAGS_VER := -march=znver2
  endif
else ifeq ($(CC_VENDOR),clang)
  ifeq ($(CLANG_OT_9_0_0),yes)
    # clang versions older than 9.0.
    CVECFLAGS_VER := -march=znver1
  else
    # clang versions 9.0 or newer.
    CVECFLAGS_VER := -march=znver2
  endif
else ifeq ($(CC_VENDOR),aocc)
  ifeq ($(AOCC_OT_2_0_0),yes)
    # aocc versions older than 2.0.
    CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp
  else
    # aocc versions 2.0 or newer.
    CVECFLAGS_VER := -march=znver2
  endif
else
  $(error gcc, clang, or aocc is required for this configuration.)
endif

# Apply the selected -march flags to both kernel flavours.
CKVECFLAGS     += $(CVECFLAGS_VER)
CRVECFLAGS     += $(CVECFLAGS_VER)

# Copy every variable set above into a new variable whose name carries the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/config/zen3/ 0000775 0000000 0000000 00000000000 14634250137 0017753 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/config/zen3/bli_cntx_init_zen3.c 0000664 0000000 0000000 00000026157 14634250137 0023716 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Populate a context for the zen3 configuration.
//
// The context is first filled with the zen3 reference defaults and then
// updated, in order, with: native level-3 micro-kernels, packm kernels,
// level-1f and level-1v kernels, native blocksizes, sup thresholds, sup
// kernels and sup blocksizes.
//
//   cntx - the context to initialize; modified in place.
void bli_cntx_init_zen3( cntx_t* cntx )
{
    // Scratch tables. The blocksize table is filled once for native
    // execution and later refilled with the sup values, so the order of the
    // statements below matters.
    blksz_t bsz[ BLIS_NUM_BLKSZS ];
    blksz_t sup_thr[ BLIS_NUM_THRESH ];

    // Begin from the reference kernels and blocksizes.
    bli_cntx_init_zen3_ref( cntx );

    // ---------------------------------------------------------------------
    // Native level-3 micro-kernels together with their storage preferences.
    // The leading integer is the number of entries that follow.
    bli_cntx_set_l3_nat_ukrs
    (
      8,

      // gemm
      BLIS_GEMM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,

      // gemmtrsm_l
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_haswell_asm_6x8,  TRUE,

      // gemmtrsm_u
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_haswell_asm_6x8,  TRUE,

      cntx
    );

#if 1
    // Optimized packm kernels: one pair per datatype, sized to that
    // datatype's MR and NR.
    bli_cntx_set_packm_kers
    (
      8,
      BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
      BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
      BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
      BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
      BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
      cntx
    );
#else
    // AMD: This will be enabled in other PRs.
    // packm kernels
    bli_cntx_set_packm_kers
    (
      2,
      BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen,
      BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen,
      cntx
    );
#endif

    // Optimized level-1f kernels.
    bli_cntx_set_l1f_kers
    (
      4,

      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_5,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,

      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,

      cntx
    );

    // Optimized level-1v kernels.
    bli_cntx_set_l1v_kers
    (
      16,

      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,

      // axpyv
      BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,

      // dotv
      BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int10,
      BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int10,

      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,

      // scalv
      BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,

      // swapv
      BLIS_SWAPV_KER, BLIS_FLOAT,  bli_sswapv_zen_int8,
      BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,

      // copyv
      BLIS_COPYV_KER, BLIS_FLOAT,  bli_scopyv_zen_int,
      BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,

      // setv
      BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
      BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,

      cntx
    );

    // Native level-3 blocksizes; the four columns are the s, d, c and z
    // datatypes.
    //
    // These are reference block sizes and may be overridden based on
    // number of threads used at runtime.
    //                                        s     d     c     z
    bli_blksz_init_easy( &bsz[ BLIS_MR ],     6,    6,    3,    3 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    16,    8,    8,    4 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   144,   72,   72,   36 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   256,  256,  256,  256 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  4080, 4080, 4080, 4080 );
    // Level-1f fusing factors; -1 leaves the c and z columns unset here.
    bli_blksz_init_easy( &bsz[ BLIS_AF ],     5,    5,   -1,   -1 );
    bli_blksz_init_easy( &bsz[ BLIS_DF ],     8,    8,   -1,   -1 );

    // Register the native blocksizes; the third field of every entry names
    // the blocksize it must be a multiple of.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,

      // level-3
      BLIS_NC, &bsz[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &bsz[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &bsz[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &bsz[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &bsz[ BLIS_MR ], BLIS_MR,

      // level-1f
      BLIS_AF, &bsz[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &bsz[ BLIS_DF ], BLIS_DF,

      cntx
    );

    // ---------------------------------------------------------------------
    // Sup thresholds for the m, n and k dimensions.
    //                                          s    d    c   z
    bli_blksz_init_easy( &sup_thr[ BLIS_MT ], 512, 256, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_NT ], 200, 256, -1, -1 );
    bli_blksz_init_easy( &sup_thr[ BLIS_KT ], 240, 220, -1, -1 );

    // Register the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &sup_thr[ BLIS_MT ],
      BLIS_NT, &sup_thr[ BLIS_NT ],
      BLIS_KT, &sup_thr[ BLIS_KT ],
      cntx
    );

#if 0
    // Disabled: register the reference sup handlers for gemm and gemmt.
    bli_cntx_set_l3_sup_handlers
    (
      2,
      BLIS_GEMM,  bli_gemmsup_ref,
      BLIS_GEMMT, bli_gemmtsup_ref,
      cntx
    );
#endif

#if 1
    // Small/unpacked (sup) gemm kernels: one entry per storage combination,
    // for double and float only (16 entries).
    bli_cntx_set_l3_sup_kers
    (
      16,

      //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,

      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,

      cntx
    );
#else
    // AMD: This should be enabled in the PR which has added these kernels
    // Disabled alternative: 28 entries that also cover the complex types.
    bli_cntx_set_l3_sup_kers
    (
      28,

      //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,

      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,

      BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,

      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,

      cntx
    );
#endif

    // Sup blocksizes. These overwrite the native values still held in the
    // scratch table, which is why this must come after bli_cntx_set_blkszs().
    // MR is given two rows of values via bli_blksz_init().
    //                                        s     d     c     z
    bli_blksz_init     ( &bsz[ BLIS_MR ],     6,    6,    3,    3,
                                              9,    9,    3,    3 );
    bli_blksz_init_easy( &bsz[ BLIS_NR ],    16,    8,    8,    4 );
    bli_blksz_init_easy( &bsz[ BLIS_MC ],   144,   72,   72,   36 );
    bli_blksz_init_easy( &bsz[ BLIS_KC ],   512,  256,  128,   64 );
    bli_blksz_init_easy( &bsz[ BLIS_NC ],  8160, 4080, 2040, 1020 );

    // Register the sup blocksizes for small/unpacked level-3 problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &bsz[ BLIS_NC ],
      BLIS_KC, &bsz[ BLIS_KC ],
      BLIS_MC, &bsz[ BLIS_MC ],
      BLIS_NR, &bsz[ BLIS_NR ],
      BLIS_MR, &bsz[ BLIS_MR ],
      cntx
    );
}
cython-blis-1.0.0/blis/_src/config/zen3/bli_family_zen3.h 0000664 0000000 0000000 00000007460 14634250137 0023201 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLI_FAMILY_ZEN3_
#define BLI_FAMILY_ZEN3_

// Configuration header for the zen3 family: threading limits, the
// configuration marker macro, and small-matrix feature switches/thresholds.

// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force the JR and IR inner loops
// to not be parallelized.
//
#define BLIS_THREAD_MAX_IR      1
#define BLIS_THREAD_MAX_JR      1

// To enable framework optimizations for zen3 platform
// All zen3 specific code should be included in this macro
#define BLIS_CONFIG_ZEN3

// Feature switches for the small-matrix code (consumed elsewhere in the
// framework).
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM

// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES        700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128

#define BLIS_SMALL_MATRIX_THRES_TRSM   32768 //128(128+128) => m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128

// "ROME"-suffixed switch and thresholds.
// NOTE(review): the exact meaning of each threshold is defined by the code
// that consumes it, which is not part of this header.
#define BLIS_ENABLE_SMALL_MATRIX_ROME
#define BLIS_SMALL_MATRIX_THRES_ROME       400

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50

#endif
cython-blis-1.0.0/blis/_src/config/zen3/make_defs.mk 0000664 0000000 0000000 00000010100 14634250137 0022212 0 ustar 00root root 0000000 0000000 #
#
# BLIS
# An object-based framework for developing high-performance BLAS-like
# libraries.
#
# Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
THIS_CONFIG := zen3
#CONFIGS_INCL += $(THIS_CONFIG)
#
# --- Determine the C compiler and related flags ---
#
# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
# The four variables below are deliberately reset to empty values here.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=
# Emit debug symbols (-g) for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif
# Only DEBUG_TYPE=noopt disables optimization; every other value uses -O3.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O3
endif
# Flags specific to optimized and reference kernels.
# NOTE: The -fomit-frame-pointer option is needed for some kernels because
# they make explicit use of the rbp register.
# CK* variables carry the optimized-kernel flags and CR* the reference-kernel
# flags. The reference-kernel optimization flags mirror the optimized-kernel
# ones, and the reference-kernel vector flags extend the optimized-kernel
# vector flags with relaxed floating-point options. A compiler-specific
# -march flag is appended to both vector flag sets further below.
CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer
CROPTFLAGS := $(CKOPTFLAGS)
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast
# Choose the -march value (CVECFLAGS_VER) from the compiler vendor and its
# version. For each supported vendor (gcc, clang, aocc) the newest compilers
# get -march=znver3, the next-older range falls back to -march=znver2, and
# the oldest range falls back to -march=znver1. Any other vendor aborts the
# build via $(error ...).
# NOTE(review): the *_OT_* version-test variables are not set in this file;
# presumably they are provided by the surrounding build system -- confirm.
ifeq ($(CC_VENDOR),gcc)
ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1.
CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
else
ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1.
CVECFLAGS_VER := -march=znver2
else # gcc versions 10.1 or newer.
CVECFLAGS_VER := -march=znver3
endif
endif
else
ifeq ($(CC_VENDOR),clang)
ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0.
CVECFLAGS_VER := -march=znver1
else
ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0.
CVECFLAGS_VER := -march=znver2
else # clang versions 12.0 or newer.
CVECFLAGS_VER := -march=znver3
endif
endif
else
ifeq ($(CC_VENDOR),aocc)
ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0.
CVECFLAGS_VER := -march=znver1
else
ifeq ($(AOCC_OT_3_0_0),yes) # aocc versions 2.0 or newer, but older than 3.0.
CVECFLAGS_VER := -march=znver2
else # aocc versions 3.0 or newer.
CVECFLAGS_VER := -march=znver3
endif
endif
else
$(error gcc, clang, or aocc is required for this configuration.)
endif
endif
endif
# Append the selected -march flag to both the optimized-kernel and the
# reference-kernel vector flags.
CKVECFLAGS += $(CVECFLAGS_VER)
CRVECFLAGS += $(CVECFLAGS_VER)
# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))
cython-blis-1.0.0/blis/_src/frame/ 0000775 0000000 0000000 00000000000 14634250137 0016721 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/0/ 0000775 0000000 0000000 00000000000 14634250137 0017060 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/0/bli_l0.h 0000664 0000000 0000000 00000003541 14634250137 0020375 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l0_check.h"
#include "bli_l0_oapi.h"
#include "bli_l0_tapi.h"
#include "bli_l0_ft.h"
// Generate function pointer arrays for tapi functions.
#include "bli_l0_fpa.h"
// copysc
#include "bli_copysc.h"
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_check.c 0000664 0000000 0000000 00000021011 14634250137 0021515 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based check functions.
//
// Two-operand scalar operations: every generated <op>_check() entry point
// forwards its operands unchanged to the shared bli_l0_xxsc_check() helper.
#undef  GENFRONT
#define GENFRONT( op ) \
void PASTEMAC(op,_check)( obj_t* chi, obj_t* psi ) \
{ \
	bli_l0_xxsc_check( chi, psi ); \
}

GENFRONT( addsc )
GENFRONT( copysc )
GENFRONT( divsc )
GENFRONT( mulsc )
GENFRONT( sqrtsc )
GENFRONT( subsc )
// Single-operand scalar operations: the generated <op>_check() entry point
// forwards chi to the shared bli_l0_xsc_check() helper.
#undef  GENFRONT
#define GENFRONT( op ) \
void PASTEMAC(op,_check)( obj_t* chi ) \
{ \
	bli_l0_xsc_check( chi ); \
}

GENFRONT( invertsc )
// Operations taking an input scalar and a second output scalar: the
// generated <op>_check() entry point forwards both operands to the shared
// bli_l0_xx2sc_check() helper.
#undef  GENFRONT
#define GENFRONT( op ) \
void PASTEMAC(op,_check)( obj_t* chi, obj_t* norm ) \
{ \
	bli_l0_xx2sc_check( chi, norm ); \
}

GENFRONT( absqsc )
GENFRONT( normfsc )
// -----------------------------------------------------------------------------
// Validate the operands of getsc. Only chi is inspected; zeta_r and zeta_i
// are not examined by this function.
void bli_getsc_check( obj_t* chi, double* zeta_r, double* zeta_i )
{
	err_t status;

	// No datatype test is applied to chi here (a
	// bli_check_noninteger_object() test on chi is disabled).

	// Dimension test: chi must pass the scalar-object check.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	// Buffer test: chi's buffer is checked (for non-NULLness).
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );
}
// Validate the operands of setsc. Only chi is inspected; the by-value
// zeta_r and zeta_i arguments are not examined by this function.
void bli_setsc_check( double zeta_r, double zeta_i, obj_t* chi )
{
	err_t status;

	// No datatype test is applied to chi here (a
	// bli_check_floating_object() test on chi is disabled).

	// Dimension test: chi must pass the scalar-object check.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	// Buffer test: chi's buffer is checked (for non-NULLness).
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );
}
// Validate the operands of unzipsc: an input scalar chi and the two
// objects zeta_r and zeta_i. Checks run in a fixed order (datatypes,
// then dimensions, then buffers), each result being handed to
// bli_check_error_code().
void bli_unzipsc_check( obj_t* chi, obj_t* zeta_r, obj_t* zeta_i )
{
	err_t status;

	// --- Datatype tests ---

	// chi: non-integer object check.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	// zeta_r, zeta_i: real-object check.
	status = bli_check_real_object( zeta_r );
	bli_check_error_code( status );

	status = bli_check_real_object( zeta_i );
	bli_check_error_code( status );

	// zeta_r, zeta_i: non-constant object check.
	status = bli_check_nonconstant_object( zeta_r );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( zeta_i );
	bli_check_error_code( status );

	// zeta_r, zeta_i: real-projection-of-chi check.
	status = bli_check_object_real_proj_of( chi, zeta_r );
	bli_check_error_code( status );

	status = bli_check_object_real_proj_of( chi, zeta_i );
	bli_check_error_code( status );

	// --- Dimension tests: all three operands must be scalar objects ---

	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	status = bli_check_scalar_object( zeta_r );
	bli_check_error_code( status );

	status = bli_check_scalar_object( zeta_i );
	bli_check_error_code( status );

	// --- Buffer tests (for non-NULLness) on all three operands ---

	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );

	status = bli_check_object_buffer( zeta_r );
	bli_check_error_code( status );

	status = bli_check_object_buffer( zeta_i );
	bli_check_error_code( status );
}
// Validate the operands of zipsc: the two objects zeta_r and zeta_i and
// the scalar chi. Checks run in a fixed order (datatypes, then
// dimensions, then buffers), each result being handed to
// bli_check_error_code().
void bli_zipsc_check( obj_t* zeta_r, obj_t* zeta_i, obj_t* chi )
{
	err_t status;

	// --- Datatype tests ---

	// zeta_r, zeta_i: real-object check.
	status = bli_check_real_object( zeta_r );
	bli_check_error_code( status );

	status = bli_check_real_object( zeta_i );
	bli_check_error_code( status );

	// chi: non-integer and non-constant object checks.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( chi );
	bli_check_error_code( status );

	// zeta_r, zeta_i: real-projection-of-chi check.
	status = bli_check_object_real_proj_of( chi, zeta_r );
	bli_check_error_code( status );

	status = bli_check_object_real_proj_of( chi, zeta_i );
	bli_check_error_code( status );

	// --- Dimension tests: all three operands must be scalar objects ---

	status = bli_check_scalar_object( zeta_r );
	bli_check_error_code( status );

	status = bli_check_scalar_object( zeta_i );
	bli_check_error_code( status );

	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	// --- Buffer tests (for non-NULLness) on all three operands ---

	status = bli_check_object_buffer( zeta_r );
	bli_check_error_code( status );

	status = bli_check_object_buffer( zeta_i );
	bli_check_error_code( status );

	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );
}
// -----------------------------------------------------------------------------
// Shared validator for level-0 operations with a single scalar operand
// chi: datatype, dimension, and buffer checks, in that order.
void bli_l0_xsc_check( obj_t* chi )
{
	err_t status;

	// Datatype tests: non-integer and non-constant object checks.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( chi );
	bli_check_error_code( status );

	// Dimension test: scalar-object check.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	// Buffer test (for non-NULLness).
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );
}
// Shared validator for level-0 operations with two scalar operands, chi
// and psi. Only psi receives the non-constant object check.
void bli_l0_xxsc_check( obj_t* chi, obj_t* psi )
{
	err_t status;

	// Datatype tests: both operands get the non-integer check; psi also
	// gets the non-constant check.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	status = bli_check_noninteger_object( psi );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( psi );
	bli_check_error_code( status );

	// Dimension tests: scalar-object check on both operands.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	status = bli_check_scalar_object( psi );
	bli_check_error_code( status );

	// Buffer tests (for non-NULLness) on both operands.
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );

	status = bli_check_object_buffer( psi );
	bli_check_error_code( status );
}
// Shared validator for level-0 operations taking a scalar chi and a
// second scalar absq that must pass the real-object and
// real-projection-of-chi checks.
void bli_l0_xx2sc_check( obj_t* chi, obj_t* absq )
{
	err_t status;

	// Datatype tests: chi gets the non-integer check; absq gets the
	// non-constant, real-object, and real-projection-of-chi checks.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( absq );
	bli_check_error_code( status );

	status = bli_check_real_object( absq );
	bli_check_error_code( status );

	status = bli_check_object_real_proj_of( chi, absq );
	bli_check_error_code( status );

	// Dimension tests: scalar-object check on both operands.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	status = bli_check_scalar_object( absq );
	bli_check_error_code( status );

	// Buffer tests (for non-NULLness) on both operands.
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );

	status = bli_check_object_buffer( absq );
	bli_check_error_code( status );
}
// Shared validator for level-0 operations taking two scalar operands and
// a bool pointer. Only chi and psi are inspected; is_eq is not examined
// by this function, and neither operand gets a non-constant check.
void bli_l0_xxbsc_check( obj_t* chi, obj_t* psi, bool* is_eq )
{
	err_t status;

	// Datatype tests: non-integer check on both operands.
	status = bli_check_noninteger_object( chi );
	bli_check_error_code( status );

	status = bli_check_noninteger_object( psi );
	bli_check_error_code( status );

	// Dimension tests: scalar-object check on both operands.
	status = bli_check_scalar_object( chi );
	bli_check_error_code( status );

	status = bli_check_scalar_object( psi );
	bli_check_error_code( status );

	// Buffer tests (for non-NULLness) on both operands.
	status = bli_check_object_buffer( chi );
	bli_check_error_code( status );

	status = bli_check_object_buffer( psi );
	bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_check.h 0000664 0000000 0000000 00000006602 14634250137 0021533 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Each GENTPROT definition below is a prototype template: it is redefined
// with a different parameter list and then instantiated once per operation
// name, producing a prototype for that operation's _check function.

// Two object operands: chi and psi.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* chi, \
obj_t* psi \
);
GENTPROT( addsc )
GENTPROT( copysc )
GENTPROT( divsc )
GENTPROT( mulsc )
GENTPROT( sqrtsc )
GENTPROT( subsc )
// One object operand: chi.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* chi \
);
GENTPROT( invertsc )
// Two object operands: chi and absq.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* chi, \
obj_t* absq \
);
GENTPROT( absqsc )
GENTPROT( normfsc )
// One object operand (chi) plus two double pointers (zeta_r, zeta_i).
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* chi, \
double* zeta_r, \
double* zeta_i \
);
GENTPROT( getsc )
// Two double values (zeta_r, zeta_i) plus one object operand (chi).
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
double zeta_r, \
double zeta_i, \
obj_t* chi \
);
GENTPROT( setsc )
// Three object operands, chi first: chi, zeta_r, zeta_i.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* chi, \
obj_t* zeta_r, \
obj_t* zeta_i \
);
GENTPROT( unzipsc )
// Three object operands, chi last: zeta_r, zeta_i, chi.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* zeta_r, \
obj_t* zeta_i, \
obj_t* chi \
);
GENTPROT( zipsc )
// -----------------------------------------------------------------------------
// Prototypes for the shared level-0 check helpers that the per-operation
// _check functions forward to.

// Validates a single scalar operand.
void bli_l0_xsc_check
     (
       obj_t* chi
     );

// Validates two scalar operands.
void bli_l0_xxsc_check
     (
       obj_t* chi,
       obj_t* psi
     );

// Validates a scalar operand chi and a second scalar absq. The parameter
// name matches the one used by the function's definition.
void bli_l0_xx2sc_check
     (
       obj_t* chi,
       obj_t* absq
     );

// Validates two scalar operands; also accepts a bool pointer.
void bli_l0_xxbsc_check
     (
       obj_t* chi,
       obj_t* psi,
       bool*  is_eq
     );
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_fpa.c 0000664 0000000 0000000 00000004506 14634250137 0021220 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each operation, emit the function-pointer array via GENARRAY_FPA
// and a <op>_qfp() query function that returns the entry of the <op>_fpa
// array selected by the datatype dt.
#undef  GENFRONT
#define GENFRONT( op ) \
GENARRAY_FPA( PASTECH(op,_vft), op ); \
PASTECH(op,_vft) PASTEMAC(op,_qfp)( num_t dt ) \
{ \
	return PASTECH(op,_fpa)[ dt ]; \
}

GENFRONT( absqsc )
GENFRONT( normfsc )
GENFRONT( addsc )
GENFRONT( divsc )
GENFRONT( mulsc )
GENFRONT( subsc )
GENFRONT( invertsc )
GENFRONT( sqrtsc )
GENFRONT( unzipsc )
GENFRONT( zipsc )
// Same query-function template as above, except that the array is emitted
// with GENARRAY_FPA_I instead of GENARRAY_FPA.
#undef  GENFRONT
#define GENFRONT( op ) \
GENARRAY_FPA_I( PASTECH(op,_vft), op ); \
PASTECH(op,_vft) PASTEMAC(op,_qfp)( num_t dt ) \
{ \
	return PASTECH(op,_fpa)[ dt ]; \
}

GENFRONT( getsc )
GENFRONT( setsc )
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_fpa.h 0000664 0000000 0000000 00000003777 14634250137 0021236 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// Prototype template for the per-operation query function: for a given
// operation name it declares <opname>_qfp, which takes a datatype (num_t)
// and returns a value of the operation's _vft function-pointer type.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );
GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )
GENPROT( getsc )
GENPROT( setsc )
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_ft.h 0000664 0000000 0000000 00000010030 14634250137 0021055 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-0 function types ---------------------------------------------------
//
// Each GENTDEF/GENTDEFR definition below is a typedef template for a
// function-pointer type returning void; it is redefined per signature and
// instantiated through INSERT_GENTDEF/INSERT_GENTDEFR for the named operation.
// NOTE(review): the INSERT_* macros are defined elsewhere; presumably they
// expand the template once per supported datatype -- confirm.

// addsc, divsc, subsc
// Signature: conjugation flag for chi, then typed pointers chi and psi.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
conj_t conjchi, \
ctype* chi, \
ctype* psi \
);
INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )
// invertsc
// Signature: conjugation flag for chi, then a typed pointer chi.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
conj_t conjchi, \
ctype* chi \
);
INSERT_GENTDEF( invertsc )
// mulsc
// Signature: identical to the addsc/divsc/subsc template above.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
conj_t conjchi, \
ctype* chi, \
ctype* psi \
);
INSERT_GENTDEF( mulsc )
// absqsc
// Signature: typed pointer chi and a pointer absq of the ctype_r type.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype* chi, \
ctype_r* absq \
);
INSERT_GENTDEFR( absqsc )
// normfsc
// Signature: typed pointer chi and a pointer norm of the ctype_r type.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype* chi, \
ctype_r* norm \
);
INSERT_GENTDEFR( normfsc )
// sqrtsc
// Signature: typed pointers chi and psi (no conjugation flag).
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype* chi, \
ctype* psi \
);
INSERT_GENTDEF( sqrtsc )
// getsc
// Signature: typed pointer chi and two double pointers, zeta_r and zeta_i.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype* chi, \
double* zeta_r, \
double* zeta_i \
);
INSERT_GENTDEF( getsc )
// setsc
// Signature: two double values, zeta_r and zeta_i, and a typed pointer chi.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
double zeta_r, \
double zeta_i, \
ctype* chi \
);
INSERT_GENTDEF( setsc )
// unzipsc
// Signature: typed pointer chi, then ctype_r pointers zeta_r and zeta_i.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype* chi, \
ctype_r* zeta_r, \
ctype_r* zeta_i \
);
INSERT_GENTDEFR( unzipsc )
// zipsc
// Signature: ctype_r pointers zeta_r and zeta_i, then a typed pointer chi.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
( \
ctype_r* zeta_r, \
ctype_r* zeta_i, \
ctype* chi \
);
INSERT_GENTDEFR( zipsc )
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_oapi.c 0000664 0000000 0000000 00000020574 14634250137 0021405 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based interfaces.
//
// Object-API front end template for operations taking an input scalar chi
// and an output object absq. The generated function:
//   1. calls bli_init_once();
//   2. derives a datatype from absq via bli_obj_dt_proj_to_complex() and
//      obtains absq's buffer via bli_obj_buffer_at_off();
//   3. runs the operation's _check function when error checking is enabled;
//   4. resolves chi's datatype and buffer via bli_obj_scalar_set_dt_buffer();
//   5. queries the function pointer for chi's datatype and invokes it with
//      the two buffers.
// Note that absq's datatype and buffer are queried before the _check
// function runs.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* absq \
) \
{ \
bli_init_once(); \
\
num_t dt_chi; \
num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \
\
void* buf_chi; \
void* buf_absq = bli_obj_buffer_at_off( absq ); \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( chi, absq ); \
\
/* If chi is a scalar constant, use dt_absq_c to extract the address of the
corresponding constant value; otherwise, use the datatype encoded
within the chi object and extract the buffer at the chi offset. */ \
bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
f \
( \
buf_chi, \
buf_absq \
); \
}
GENFRONT( absqsc )
GENFRONT( normfsc )
// Object front-ends for addsc, divsc, mulsc and subsc: dispatch on the
// datatype of psi and pass chi's conjugation status along with both buffers.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi \
     ) \
{ \
    bli_init_once(); \
\
    /* psi's datatype drives both the chi buffer lookup and the dispatch. */ \
    num_t   psi_dt      = bli_obj_dt( psi ); \
    conj_t  chi_conj_st = bli_obj_conj_status( chi ); \
\
    void*   chi_buf     = bli_obj_buffer_for_1x1( psi_dt, chi ); \
    void*   psi_buf     = bli_obj_buffer_at_off( psi ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi, psi ); \
    } \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for psi's datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( psi_dt ); \
\
    fp( chi_conj_st, chi_buf, psi_buf ); \
}
GENFRONT( addsc )
GENFRONT( divsc )
GENFRONT( mulsc )
GENFRONT( subsc )
// Object front-end for invertsc: dispatch on chi's own datatype and pass its
// conjugation status and buffer to the datatype-specific implementation.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  chi \
     ) \
{ \
    bli_init_once(); \
\
    num_t   chi_dt      = bli_obj_dt( chi ); \
    conj_t  chi_conj_st = bli_obj_conj_status( chi ); \
    void*   chi_buf     = bli_obj_buffer_for_1x1( chi_dt, chi ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi ); \
    } \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for chi's datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( chi_dt ); \
\
    fp( chi_conj_st, chi_buf ); \
}
GENFRONT( invertsc )
// Object front-end for sqrtsc: dispatch on the datatype of psi and pass the
// chi and psi buffers (no conjugation argument) to the implementation.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi \
     ) \
{ \
    bli_init_once(); \
\
    /* psi's datatype drives both the chi buffer lookup and the dispatch. */ \
    num_t  psi_dt  = bli_obj_dt( psi ); \
\
    void*  chi_buf = bli_obj_buffer_for_1x1( psi_dt, chi ); \
    void*  psi_buf = bli_obj_buffer_at_off( psi ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi, psi ); \
    } \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for psi's datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( psi_dt ); \
\
    fp( chi_buf, psi_buf ); \
}
GENFRONT( sqrtsc )
// Object front-end for getsc: hand chi's buffer and the caller's two double
// pointers to the implementation selected for chi's datatype (or for
// BLIS_DCOMPLEX when chi is a constant object).
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*   chi, \
       double*  zeta_r, \
       double*  zeta_i \
     ) \
{ \
    bli_init_once(); \
\
    num_t  chi_dt     = bli_obj_dt( chi ); \
    num_t  fallback_dt = BLIS_DCOMPLEX; \
\
    /* For a constant object, default to the dcomplex value: it maximizes */ \
    /* precision, and it is unknown whether the caller needs only the real */ \
    /* part or both the real and imaginary parts. */ \
    void*  chi_buf    = bli_obj_buffer_for_1x1( fallback_dt, chi ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
    } \
\
    /* The _check() routine prevents integer types, so chi is either a */ \
    /* constant or an actual floating-point type. */ \
    num_t  dispatch_dt = bli_is_constant( chi_dt ) ? fallback_dt : chi_dt; \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for the chosen datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( dispatch_dt ); \
\
    fp( chi_buf, zeta_r, zeta_i ); \
}
GENFRONT( getsc )
// Object front-end for setsc: pass the two double values and chi's buffer to
// the implementation selected by chi's datatype.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       obj_t*  chi \
     ) \
{ \
    bli_init_once(); \
\
    num_t  chi_dt  = bli_obj_dt( chi ); \
    void*  chi_buf = bli_obj_buffer_at_off( chi ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \
    } \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for chi's datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( chi_dt ); \
\
    fp( zeta_r, zeta_i, chi_buf ); \
}
GENFRONT( setsc )
// Object front-end for unzipsc: resolve chi's datatype and buffer, then pass
// the chi, zeta_r and zeta_i buffers to the datatype-specific implementation.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  zeta_r, \
       obj_t*  zeta_i \
     ) \
{ \
    bli_init_once(); \
\
    /* Complex projection of zeta_r's datatype. */ \
    num_t  zeta_dt_c  = bli_obj_dt_proj_to_complex( zeta_r ); \
\
    void*  zeta_r_buf = bli_obj_buffer_at_off( zeta_r ); \
    void*  zeta_i_buf = bli_obj_buffer_at_off( zeta_i ); \
\
    /* Both are filled in by bli_obj_scalar_set_dt_buffer() below. */ \
    num_t  chi_dt; \
    void*  chi_buf; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
    } \
\
    /* When chi is a scalar constant, zeta_dt_c selects which constant */ \
    /* value is addressed; otherwise chi's own datatype is used and its */ \
    /* buffer is taken at the chi offset. */ \
    bli_obj_scalar_set_dt_buffer( chi, zeta_dt_c, &chi_dt, &chi_buf ); \
\
    /* Look up the function pointer (void* arguments rather than typed */ \
    /* pointers) for chi's datatype and invoke it. */ \
    PASTECH(opname,_vft) fp = PASTEMAC(opname,_qfp)( chi_dt ); \
\
    fp( chi_buf, zeta_r_buf, zeta_i_buf ); \
}
GENFRONT( unzipsc )
// Object front-end for zipsc: pass the zeta_r and zeta_i buffers, followed by
// chi's buffer, to the implementation selected by chi's datatype.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  zeta_r, \
       obj_t*  zeta_i, \
       obj_t*  chi \
     ) \
{ \
    bli_init_once(); \
\
    num_t  dt_chi     = bli_obj_dt( chi ); \
\
    /* Buffers for the two input scalars, looked up using chi's datatype. */ \
    void*  buf_zeta_r = bli_obj_buffer_for_1x1( dt_chi, zeta_r ); \
    void*  buf_zeta_i = bli_obj_buffer_for_1x1( dt_chi, zeta_i ); \
\
    void*  buf_chi    = bli_obj_buffer_at_off( chi ); \
\
    if ( bli_error_checking_is_enabled() ) \
        PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \
\
    /* The typed zipsc interface takes (zeta_r, zeta_i, chi), so the real */ \
    /* part must be passed first. Passing zeta_i first would swap the */ \
    /* real and imaginary components written to chi. */ \
    f \
    ( \
      buf_zeta_r, \
      buf_zeta_i, \
      buf_chi \
    ); \
}
GENFRONT( zipsc )
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_oapi.h 0000664 0000000 0000000 00000006016 14634250137 0021405 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Exported object-API prototypes for absqsc and normfsc. Each takes two
// obj_t pointers (chi, absq) and returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* absq \
);
GENPROT( absqsc )
GENPROT( normfsc )
// Exported object-API prototypes for addsc, divsc, mulsc, sqrtsc and subsc.
// Each takes two obj_t pointers (chi, psi) and returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
);
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( sqrtsc )
GENPROT( subsc )
// Exported object-API prototype for invertsc, which takes a single obj_t
// pointer (chi) and returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi \
);
GENPROT( invertsc )
// Exported object-API prototype for getsc: takes an obj_t pointer (chi) and
// two double pointers (zeta_r, zeta_i); returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
double* zeta_r, \
double* zeta_i \
);
GENPROT( getsc )
// Exported object-API prototype for setsc: takes two double values
// (zeta_r, zeta_i) by value and an obj_t pointer (chi); returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
double zeta_r, \
double zeta_i, \
obj_t* chi \
);
GENPROT( setsc )
// Exported object-API prototype for unzipsc: takes three obj_t pointers in
// the order (chi, zeta_r, zeta_i); returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* zeta_r, \
obj_t* zeta_i \
);
GENPROT( unzipsc )
// Exported object-API prototype for zipsc: takes three obj_t pointers in
// the order (zeta_r, zeta_i, chi); returns void.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* zeta_r, \
obj_t* zeta_i, \
obj_t* chi \
);
GENPROT( zipsc )
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_tapi.c 0000664 0000000 0000000 00000013407 14634250137 0021407 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces with typed operands.
//
// Typed addsc, divsc and subsc: copy *chi into a local through the copycjs
// macro (which also receives conjchi), then apply the named scalar kernel
// (adds, invscals, subs) to the local and *psi.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi \
     ) \
{ \
    bli_init_once(); \
\
    /* Local that receives chi via copycjs; *chi itself is only read here. */ \
    ctype  chi_local; \
\
    PASTEMAC(ch,copycjs)( conjchi, *chi, chi_local ); \
\
    /* Apply the scalar kernel to the local copy and psi. */ \
    PASTEMAC(ch,kername)( chi_local, *psi ); \
}
INSERT_GENTFUNC_BASIC( addsc, adds )
INSERT_GENTFUNC_BASIC( divsc, invscals )
INSERT_GENTFUNC_BASIC( subsc, subs )
// Typed invertsc: copy *chi into a local through copycjs (which also receives
// conjchi), apply the inverts kernel to the local, then copy the local back
// into *chi.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi \
     ) \
{ \
    bli_init_once(); \
\
    /* Scratch value the kernel operates on. */ \
    ctype  chi_local; \
\
    PASTEMAC(ch,copycjs)( conjchi, *chi, chi_local ); \
    PASTEMAC(ch,kername)( chi_local ); \
\
    /* Store the result back through the caller's pointer. */ \
    PASTEMAC(ch,copys)( chi_local, *chi ); \
}
INSERT_GENTFUNC_BASIC( invertsc, inverts )
// Typed mulsc: when *chi tests equal to zero, *psi is set to zero directly;
// otherwise *psi is scaled by chi (passed through copycjs with conjchi).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi \
     ) \
{ \
    bli_init_once(); \
\
    /* Zero scale factor: overwrite psi outright so that potential Infs */ \
    /* and NaNs in psi are replaced by zero, and finish early. */ \
    if ( PASTEMAC(ch,eq0)( *chi ) ) \
    { \
        PASTEMAC(ch,set0s)( *psi ); \
        return; \
    } \
\
    /* General case: copy chi into a local via copycjs, then scale psi. */ \
    ctype  chi_local; \
\
    PASTEMAC(ch,copycjs)( conjchi, *chi, chi_local ); \
    PASTEMAC(ch,kername)( chi_local, *psi ); \
}
INSERT_GENTFUNC_BASIC( mulsc, scals )
// Typed absqsc: split *chi into two ctype_r components, then compute
// chi_r * chi_r + chi_i * chi_i into *absq via the absq2ris macro.
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*    chi, \
       ctype_r*  absq \
     ) \
{ \
    bli_init_once(); \
\
    ctype_r  chi_re; \
    ctype_r  chi_im; \
    ctype_r  absq_im; \
\
    /* Reference absq_im so it is never reported as unused. */ \
    ( void )absq_im; \
\
    PASTEMAC2(ch,chr,gets)( *chi, chi_re, chi_im ); \
\
    /* absq    = chi_re * chi_re + chi_im * chi_im; */ \
    /* absq_im = 0.0; (thrown away) */ \
    PASTEMAC(ch,absq2ris)( chi_re, chi_im, *absq, absq_im ); \
\
    /* Likewise reference chi_im so it is never reported as unused. */ \
    ( void )chi_im; \
}
INSERT_GENTFUNCR_BASIC0( absqsc )
// Typed normfsc: after bli_init_once(), applies the abval2s macro to *chi and
// stores the result in *norm (a ctype_r value).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* norm \
) \
{ \
bli_init_once(); \
\
/* norm = sqrt( chi_r * chi_r + chi_i * chi_i ); */ \
PASTEMAC2(ch,chr,abval2s)( *chi, *norm ); \
}
INSERT_GENTFUNCR_BASIC0( normfsc )
// Typed sqrtsc: after bli_init_once(), applies the sqrt2s macro to *chi and
// *psi, both of the same ctype.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype* psi \
) \
{ \
bli_init_once(); \
\
/* NOTE: sqrtsc/sqrt2s differs from normfsc/abval2s in the complex domain. */ \
PASTEMAC(ch,sqrt2s)( *chi, *psi ); \
}
INSERT_GENTFUNC_BASIC0( sqrtsc )
// Typed getsc: after bli_init_once(), applies the (ch,d) gets macro to *chi,
// *zeta_r and *zeta_i. The two zeta outputs are always double, whatever
// ctype is.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* chi, \
double* zeta_r, \
double* zeta_i \
) \
{ \
bli_init_once(); \
\
PASTEMAC2(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \
}
INSERT_GENTFUNC_BASIC0( getsc )
// Typed setsc: after bli_init_once(), applies the (d,ch) sets macro to the
// two double values zeta_r and zeta_i and to *chi.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
double zeta_r, \
double zeta_i, \
ctype* chi \
) \
{ \
bli_init_once(); \
\
PASTEMAC2(d,ch,sets)( zeta_r, zeta_i, *chi ); \
}
INSERT_GENTFUNC_BASIC0( setsc )
// Typed unzipsc: after bli_init_once(), applies the (ch,chr) gets macro to
// *chi, *zeta_r and *zeta_i. Unlike getsc, the zeta outputs here are ctype_r
// rather than double.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* zeta_r, \
ctype_r* zeta_i \
) \
{ \
bli_init_once(); \
\
PASTEMAC2(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \
}
INSERT_GENTFUNCR_BASIC0( unzipsc )
// Typed zipsc: after bli_init_once(), applies the (chr,ch) sets macro to
// *zeta_r, *zeta_i and *chi. The argument order is (zeta_r, zeta_i, chi).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype_r* zeta_r, \
ctype_r* zeta_i, \
ctype* chi \
) \
{ \
bli_init_once(); \
\
PASTEMAC2(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \
}
INSERT_GENTFUNCR_BASIC0( zipsc )
// -----------------------------------------------------------------------------
// getsc for a dim_t scalar, written out by hand rather than generated by the
// GENTFUNC macro: applies the (i,d) gets macro to *chi, *zeta_r and *zeta_i.
void bli_igetsc
(
dim_t* chi,
double* zeta_r,
double* zeta_i
)
{
bli_init_once();
PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i );
}
// setsc for a dim_t scalar, written out by hand rather than generated by the
// GENTFUNC macro: applies the (d,i) sets macro to zeta_r, zeta_i and *chi.
void bli_isetsc
(
double zeta_r,
double zeta_i,
dim_t* chi
)
{
bli_init_once();
PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi );
}
cython-blis-1.0.0/blis/_src/frame/0/bli_l0_tapi.h 0000664 0000000 0000000 00000007611 14634250137 0021414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
// Exported typed prototypes for addsc, divsc, mulsc and subsc. Each takes a
// conj_t for chi plus two ctype pointers (chi, psi) and returns void.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi, \
ctype* psi \
);
INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )
// Exported typed prototype for invertsc: takes a conj_t and a single ctype
// pointer (chi); returns void.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
conj_t conjchi, \
ctype* chi \
);
INSERT_GENTPROT_BASIC0( invertsc )
// Exported typed prototypes for absqsc and normfsc: each takes a ctype
// pointer (chi) and a ctype_r pointer (absq); returns void.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* absq \
);
INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )
// Exported typed prototype for sqrtsc: takes two ctype pointers (chi, psi)
// and no conjugation argument; returns void.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype* psi \
);
INSERT_GENTPROT_BASIC0( sqrtsc )
// Exported typed prototype for getsc: takes a ctype pointer (chi) and two
// double pointers (zeta_r, zeta_i); returns void.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
double* zeta_r, \
double* zeta_i \
);
INSERT_GENTPROT_BASIC0( getsc )
// Exported typed prototype for setsc: takes two double values
// (zeta_r, zeta_i) by value and a ctype pointer (chi); returns void.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
double zeta_r, \
double zeta_i, \
ctype* chi \
);
INSERT_GENTPROT_BASIC0( setsc )
// Exported typed prototype for unzipsc: takes a ctype pointer (chi) followed
// by two ctype_r pointers (zeta_r, zeta_i); returns void.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype* chi, \
ctype_r* zeta_r, \
ctype_r* zeta_i \
);
INSERT_GENTPROTR_BASIC0( unzipsc )
// Exported typed prototype for zipsc: takes two ctype_r pointers
// (zeta_r, zeta_i) followed by a ctype pointer (chi); returns void.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
ctype_r* zeta_r, \
ctype_r* zeta_i, \
ctype* chi \
);
INSERT_GENTPROTR_BASIC0( zipsc )
// -----------------------------------------------------------------------------
// Exported prototype for the dim_t variant of getsc: takes a dim_t pointer
// (chi) and two double pointers (zeta_r, zeta_i); returns void.
BLIS_EXPORT_BLIS void bli_igetsc
(
dim_t* chi,
double* zeta_r,
double* zeta_i
);
// Exported prototype for the dim_t variant of setsc: takes two double values
// (zeta_r, zeta_i) by value and a dim_t pointer (chi); returns void.
BLIS_EXPORT_BLIS void bli_isetsc
(
double zeta_r,
double zeta_i,
dim_t* chi
);
cython-blis-1.0.0/blis/_src/frame/0/copysc/ 0000775 0000000 0000000 00000000000 14634250137 0020360 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/0/copysc/bli_copysc.c 0000664 0000000 0000000 00000007260 14634250137 0022657 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// NOTE: This is one of the few functions in BLIS that is defined
// with heterogeneous type support. This is done so that we have
// an operation that can be used to typecast (copy-cast) a scalar
// of one datatype to a scalar of another datatype.
// Signature shared by every typed copysc variant: a conjugation flag for chi
// and untyped pointers to the source (chi) and destination (psi) scalars.
typedef void (*FUNCPTR_T)
(
conj_t conjchi,
void* chi,
void* psi
);
// File-local table of copysc function pointers, built by GENARRAY2_ALL. The
// object front-end below indexes it as ftypes[dt_chi][dt_psi].
static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc);
//
// Define object-based interfaces.
//
// Object front-end for copysc: resolve the datatypes and buffers of chi and
// psi, then call the entry of the ftypes table for that datatype pair.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi \
     ) \
{ \
    bli_init_once(); \
\
    conj_t     chi_conj_st = bli_obj_conj_status( chi ); \
\
    num_t      psi_dt      = bli_obj_dt( psi ); \
    void*      psi_buf     = bli_obj_buffer_at_off( psi ); \
\
    /* Both are filled in by bli_obj_scalar_set_dt_buffer() below. */ \
    num_t      chi_dt; \
    void*      chi_buf; \
\
    FUNCPTR_T  copy_fp; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( chi, psi ); \
    } \
\
    /* When chi is a scalar constant, psi_dt selects which constant value */ \
    /* is addressed; otherwise chi's own datatype is used and its buffer */ \
    /* is taken at the chi offset. */ \
    bli_obj_scalar_set_dt_buffer( chi, psi_dt, &chi_dt, &chi_buf ); \
\
    /* Pick the function pointer for this (source, destination) datatype */ \
    /* combination from the table. */ \
    copy_fp = ftypes[chi_dt][psi_dt]; \
\
    /* Call the void pointer-based function. */ \
    copy_fp( chi_conj_st, chi_buf, psi_buf ); \
}
GENFRONT( copysc )
//
// Define BLAS-like interfaces with typed operands.
//
// Typed copysc for a (ctype_x, ctype_y) pair: cast the untyped pointers and
// copy *chi into *psi with copyjs when conjchi requests conjugation, or with
// copys otherwise.
#undef  GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname ) \
\
void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t  conjchi, \
       void*   chi, \
       void*   psi \
     ) \
{ \
    bli_init_once(); \
\
    /* Typed views of the source and destination scalars. */ \
    ctype_x*  src = chi; \
    ctype_y*  dst = psi; \
\
    if ( bli_is_conj( conjchi ) ) \
    { \
        /* Conjugation requested: use the copyjs variant. */ \
        PASTEMAC2(chx,chy,copyjs)( *src, *dst ); \
    } \
    else \
    { \
        /* No conjugation requested: use the copys variant. */ \
        PASTEMAC2(chx,chy,copys)( *src, *dst ); \
    } \
}
INSERT_GENTFUNC2_BASIC0( copysc )
INSERT_GENTFUNC2_MIX_D0( copysc )
INSERT_GENTFUNC2_MIX_P0( copysc )
cython-blis-1.0.0/blis/_src/frame/0/copysc/bli_copysc.h 0000664 0000000 0000000 00000004354 14634250137 0022665 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Exported object-API prototype for copysc: takes two obj_t pointers
// (chi, psi) and returns void.
#undef GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* chi, \
obj_t* psi \
);
GENFRONT( copysc )
//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// Exported typed prototypes for copysc, one per (chx, chy) pair produced by
// the three INSERT_GENTPROT2_* macros below. Every variant takes a conj_t and
// two void pointers (chi, psi), so the signature does not depend on the pair.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
( \
conj_t conjchi, \
void* chi, \
void* psi \
);
INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )
cython-blis-1.0.0/blis/_src/frame/1/ 0000775 0000000 0000000 00000000000 14634250137 0017061 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1/bli_l1v.h 0000664 0000000 0000000 00000005054 14634250137 0020566 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l1v_check.h"
// Define kernel function types.
//#include "bli_l1v_ft_ex.h"
#include "bli_l1v_ft_ker.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
#include "bli_l1v_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_oapi_ba.h"
#include "bli_l1v_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
#include "bli_tapi_ex.h"
#include "bli_l1v_tapi.h"
#include "bli_l1v_ft.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l1v_tapi.h"
#include "bli_l1v_ft.h"
#include "bli_xapi_undef.h"
// Generate function pointer arrays for tapi functions (expert only).
#include "bli_l1v_fpa.h"
// Pack-related
// NOTE: packv and unpackv are temporarily disabled.
//#include "bli_packv.h"
//#include "bli_unpackv.h"
// Other
// NOTE: scalv control tree code is temporarily disabled.
//#include "bli_scalv_cntl.h"
//#include "bli_scalv_int.h"
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_check.c 0000664 0000000 0000000 00000026702 14634250137 0021721 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based check functions.
//
// _check functions for addv, copyv, subv and swapv: each forwards its two
// operands (x, y) unchanged to bli_l1v_xy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* y \
) \
{ \
bli_l1v_xy_check( x, y ); \
}
GENFRONT( addv )
GENFRONT( copyv )
GENFRONT( subv )
GENFRONT( swapv )
// _check function for amaxv: forwards its operands (x, index) unchanged to
// bli_l1v_xi_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* index \
) \
{ \
bli_l1v_xi_check( x, index ); \
}
GENFRONT( amaxv )
// _check function for axpbyv: forwards its operands (alpha, x, beta, y)
// unchanged to bli_l1v_axby_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* beta, \
obj_t* y \
) \
{ \
bli_l1v_axby_check( alpha, x, beta, y ); \
}
GENFRONT( axpbyv )
// _check functions for axpyv and scal2v: each forwards its operands
// (alpha, x, y) unchanged to bli_l1v_axy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
) \
{ \
bli_l1v_axy_check( alpha, x, y ); \
}
GENFRONT( axpyv )
GENFRONT( scal2v )
// _check function for dotv: dotv takes no alpha or beta, so &BLIS_ONE is
// passed in both scalar positions of bli_l1v_dot_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* y, \
obj_t* rho \
) \
{ \
bli_l1v_dot_check( &BLIS_ONE, x, y, &BLIS_ONE, rho ); \
}
GENFRONT( dotv )
// _check function for dotxv: forwards its operands (alpha, x, y, beta, rho)
// unchanged to bli_l1v_dot_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y, \
obj_t* beta, \
obj_t* rho \
) \
{ \
bli_l1v_dot_check( alpha, x, y, beta, rho ); \
}
GENFRONT( dotxv )
// _check function for invertv: forwards its single operand (x) unchanged to
// bli_l1v_x_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x \
) \
{ \
bli_l1v_x_check( x ); \
}
GENFRONT( invertv )
// _check functions for scalv and setv: each forwards its operands (alpha, x)
// unchanged to bli_l1v_ax_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x \
) \
{ \
bli_l1v_ax_check( alpha, x ); \
}
GENFRONT( scalv )
GENFRONT( setv )
// _check function for xpbyv: forwards its operands (x, beta, y) unchanged to
// bli_l1v_xby_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
) \
{ \
bli_l1v_xby_check( x, beta, y ); \
}
GENFRONT( xpbyv )
// -----------------------------------------------------------------------------
// Validates the operands of a two-vector level-1v operation. The checks run
// in a fixed order (datatypes, datatype consistency, dimensions, buffers),
// and each result is passed to bli_check_error_code() before the next check.
void bli_l1v_xy_check
     (
       obj_t*  x,
       obj_t*  y
     )
{
    err_t status;

    // Datatypes: x and y must each pass the floating-object check.
    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // Datatypes of x and y must be consistent with one another.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimensions: both operands are vectors, and of equal length.
    status = bli_check_vector_object( x );
    bli_check_error_code( status );

    status = bli_check_vector_object( y );
    bli_check_error_code( status );

    status = bli_check_equal_vector_lengths( x, y );
    bli_check_error_code( status );

    // Buffers: neither operand may have a NULL buffer.
    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Validates the operands of a level-1v operation with a scalar alpha and two
// vectors. The checks run in a fixed order (datatypes, datatype consistency,
// dimensions, buffers), and each result is passed to bli_check_error_code()
// before the next check.
void bli_l1v_axy_check
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y
     )
{
    err_t status;

    // Datatypes: alpha must pass the noninteger-object check; x and y must
    // each pass the floating-object check.
    status = bli_check_noninteger_object( alpha );
    bli_check_error_code( status );

    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // Datatypes of x and y must be consistent with one another.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimensions: alpha is a scalar; x and y are vectors of equal length.
    status = bli_check_scalar_object( alpha );
    bli_check_error_code( status );

    status = bli_check_vector_object( x );
    bli_check_error_code( status );

    status = bli_check_vector_object( y );
    bli_check_error_code( status );

    status = bli_check_equal_vector_lengths( x, y );
    bli_check_error_code( status );

    // Buffers: no operand may have a NULL buffer.
    status = bli_check_object_buffer( alpha );
    bli_check_error_code( status );

    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Validates operands x and y together with a scalar beta.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_xby_check( obj_t* x, obj_t* beta, obj_t* y )
{
	err_t status;

	// Datatypes: beta, then x and y.
	status = bli_check_noninteger_object( beta );
	bli_check_error_code( status );

	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	status = bli_check_floating_object( y );
	bli_check_error_code( status );

	// Datatypes: x against y.
	status = bli_check_consistent_object_datatypes( x, y );
	bli_check_error_code( status );

	// Dimensions: scalar beta, vector x and y, matching lengths.
	status = bli_check_scalar_object( beta );
	bli_check_error_code( status );

	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	status = bli_check_vector_object( y );
	bli_check_error_code( status );

	status = bli_check_equal_vector_lengths( x, y );
	bli_check_error_code( status );

	// Buffers (non-NULLness).
	status = bli_check_object_buffer( beta );
	bli_check_error_code( status );

	status = bli_check_object_buffer( x );
	bli_check_error_code( status );

	status = bli_check_object_buffer( y );
	bli_check_error_code( status );
}
// Validates scalars alpha and beta together with operands x and y.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_axby_check( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y )
{
	err_t status;

	// Datatypes: the two scalars, then x and y.
	status = bli_check_noninteger_object( alpha );
	bli_check_error_code( status );

	status = bli_check_noninteger_object( beta );
	bli_check_error_code( status );

	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	status = bli_check_floating_object( y );
	bli_check_error_code( status );

	// Datatypes: x against y.
	status = bli_check_consistent_object_datatypes( x, y );
	bli_check_error_code( status );

	// Dimensions: scalar alpha and beta, vector x and y, matching lengths.
	status = bli_check_scalar_object( alpha );
	bli_check_error_code( status );

	status = bli_check_scalar_object( beta );
	bli_check_error_code( status );

	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	status = bli_check_vector_object( y );
	bli_check_error_code( status );

	status = bli_check_equal_vector_lengths( x, y );
	bli_check_error_code( status );

	// Buffers (non-NULLness).
	status = bli_check_object_buffer( alpha );
	bli_check_error_code( status );

	status = bli_check_object_buffer( beta );
	bli_check_error_code( status );

	status = bli_check_object_buffer( x );
	bli_check_error_code( status );

	status = bli_check_object_buffer( y );
	bli_check_error_code( status );
}
// Validates the five operands alpha, x, y, beta, and rho.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_dot_check( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho )
{
	err_t status;

	// Datatypes, operand by operand.
	status = bli_check_noninteger_object( alpha );
	bli_check_error_code( status );

	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	status = bli_check_floating_object( y );
	bli_check_error_code( status );

	status = bli_check_noninteger_object( beta );
	bli_check_error_code( status );

	status = bli_check_noninteger_object( rho );
	bli_check_error_code( status );

	// rho additionally goes through the nonconstant-object check.
	status = bli_check_nonconstant_object( rho );
	bli_check_error_code( status );

	// Datatypes: x against y.
	status = bli_check_consistent_object_datatypes( x, y );
	bli_check_error_code( status );

	// Dimensions: scalar alpha, beta, rho; vector x and y; matching lengths.
	status = bli_check_scalar_object( alpha );
	bli_check_error_code( status );

	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	status = bli_check_vector_object( y );
	bli_check_error_code( status );

	status = bli_check_scalar_object( beta );
	bli_check_error_code( status );

	status = bli_check_scalar_object( rho );
	bli_check_error_code( status );

	status = bli_check_equal_vector_lengths( x, y );
	bli_check_error_code( status );

	// Buffers (non-NULLness).
	status = bli_check_object_buffer( alpha );
	bli_check_error_code( status );

	status = bli_check_object_buffer( x );
	bli_check_error_code( status );

	status = bli_check_object_buffer( y );
	bli_check_error_code( status );

	status = bli_check_object_buffer( beta );
	bli_check_error_code( status );

	status = bli_check_object_buffer( rho );
	bli_check_error_code( status );
}
// Validates a single operand x.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_x_check( obj_t* x )
{
	err_t status;

	// Datatype.
	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	// Dimensions.
	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	// Buffer (non-NULLness).
	status = bli_check_object_buffer( x );
	bli_check_error_code( status );
}
// Validates a scalar alpha together with an operand x.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_ax_check( obj_t* alpha, obj_t* x )
{
	err_t status;

	// Datatypes.
	status = bli_check_noninteger_object( alpha );
	bli_check_error_code( status );

	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	// Dimensions: scalar alpha, vector x.
	status = bli_check_scalar_object( alpha );
	bli_check_error_code( status );

	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	// Buffers (non-NULLness).
	status = bli_check_object_buffer( alpha );
	bli_check_error_code( status );

	status = bli_check_object_buffer( x );
	bli_check_error_code( status );
}
// Validates an operand x together with an index object.
// Every check result is handed to bli_check_error_code() before the next
// check runs; the checks execute in the order written below.
void bli_l1v_xi_check( obj_t* x, obj_t* index )
{
	err_t status;

	// Datatypes: x, then index (integer and nonconstant checks).
	status = bli_check_floating_object( x );
	bli_check_error_code( status );

	status = bli_check_integer_object( index );
	bli_check_error_code( status );

	status = bli_check_nonconstant_object( index );
	bli_check_error_code( status );

	// Dimensions: vector x, scalar index.
	status = bli_check_vector_object( x );
	bli_check_error_code( status );

	status = bli_check_scalar_object( index );
	bli_check_error_code( status );

	// Buffers (non-NULLness).
	status = bli_check_object_buffer( x );
	bli_check_error_code( status );

	status = bli_check_object_buffer( index );
	bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_check.h 0000664 0000000 0000000 00000010167 14634250137 0021724 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// <opname>_check( x, y ) prototypes.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* y );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )
GENTPROT( swapv )
// <opname>_check( x, index ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* index );

GENTPROT( amaxv )
// <opname>_check( alpha, x, beta, y ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y );

GENTPROT( axpbyv )
// <opname>_check( alpha, x, y ) prototypes.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x, obj_t* y );

GENTPROT( axpyv )
GENTPROT( scal2v )
// <opname>_check( x, y, rho ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* y, obj_t* rho );

GENTPROT( dotv )
// <opname>_check( alpha, x, y, beta, rho ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho );

GENTPROT( dotxv )
// <opname>_check( x ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x );

GENTPROT( invertv )
// <opname>_check( alpha, x ) prototypes.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x );

GENTPROT( scalv )
GENTPROT( setv )
// <opname>_check( x, beta, y ) prototype.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* beta, obj_t* y );

GENTPROT( xpbyv )
// -----------------------------------------------------------------------------
// Prototypes for the operand-validation helpers. Each helper is named after
// the operand list it accepts.

// x, y
void bli_l1v_xy_check( obj_t* x, obj_t* y );

// alpha, x, y
void bli_l1v_axy_check( obj_t* alpha, obj_t* x, obj_t* y );

// x, beta, y
void bli_l1v_xby_check( obj_t* x, obj_t* beta, obj_t* y );

// alpha, x, beta, y
void bli_l1v_axby_check( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y );

// alpha, x, y, beta, rho
void bli_l1v_dot_check( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho );

// x
void bli_l1v_x_check( obj_t* x );

// alpha, x
void bli_l1v_ax_check( obj_t* alpha, obj_t* x );

// x, index
void bli_l1v_xi_check( obj_t* x, obj_t* index );
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_fpa.c 0000664 0000000 0000000 00000004412 14634250137 0021404 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each operation, invoke GENARRAY_FPA() with the operation's _vft type
// name and its typed-API name, then define a _qfp query function that returns
// entry [dt] of the operation's _fpa array (presumably the array that
// GENARRAY_FPA defines -- confirm against its definition).
#undef  GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
{ \
	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
}

GENFRONT( addv )
GENFRONT( copyv )
GENFRONT( subv )
GENFRONT( amaxv )
GENFRONT( axpbyv )
GENFRONT( axpyv )
GENFRONT( scal2v )
GENFRONT( dotv )
GENFRONT( dotxv )
GENFRONT( invertv )
GENFRONT( scalv )
GENFRONT( setv )
GENFRONT( swapv )
GENFRONT( xpbyv )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_fpa.h 0000664 0000000 0000000 00000004073 14634250137 0021414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// Declares, for each operation, a _qfp query function that takes a num_t
// datatype and returns the operation's _vft function-pointer type.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addv )
GENPROT( copyv )
GENPROT( subv )
GENPROT( amaxv )
GENPROT( axpbyv )
GENPROT( axpyv )
GENPROT( scal2v )
GENPROT( dotv )
GENPROT( dotxv )
GENPROT( invertv )
GENPROT( scalv )
GENPROT( setv )
GENPROT( swapv )
GENPROT( xpbyv )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_ft.h 0000664 0000000 0000000 00000011664 14634250137 0021263 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-1v function types --------------------------------------------------
//
// addv, copyv, subv
// Function-pointer typedef: conjx, n, then x and y (each with an inc_t
// increment), followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )
// amaxv
// Function-pointer typedef: n, x with its increment, and a dim_t* index,
// followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       dim_t*  index \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( amaxv )
// axpbyv
// Function-pointer typedef: conjx, n, alpha, x with its increment, beta, and
// y with its increment, followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpbyv )
// axpyv, scal2v
// Function-pointer typedef: conjx, n, alpha, then x and y (each with an
// inc_t increment), followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )
// dotv
// Function-pointer typedef: conjx, conjy, n, x and y (each with an inc_t
// increment), and rho, followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotv )
// dotxv
// Function-pointer typedef: conjx, conjy, n, alpha, x and y (each with an
// inc_t increment), beta, and rho, followed by whatever BLIS_TAPI_EX_PARAMS
// expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  beta, \
       ctype*  rho \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxv )
// invertv
// Function-pointer typedef: n and x with its increment, followed by whatever
// BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertv )
// scalv, setv
// Function-pointer typedef: conjalpha, n, alpha, and x with its increment,
// followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjalpha, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )
// swapv
// Function-pointer typedef: n, then x and y (each with an inc_t increment),
// followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( swapv )
// xpbyv
// Function-pointer typedef: conjx, n, x with its increment, beta, and y with
// its increment, followed by whatever BLIS_TAPI_EX_PARAMS expands to.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyv )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_ft_ker.h 0000664 0000000 0000000 00000012527 14634250137 0022123 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H
//
// -- Level-1v kernel function types -------------------------------------------
//
// addv, copyv, subv
// Kernel function-pointer typedef: conjx, n, restrict-qualified x and y
// (each with an inc_t increment), and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )
// amaxv
// Kernel function-pointer typedef: n, restrict-qualified x with its
// increment, a restrict-qualified dim_t* index, and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( amaxv )
// axpbyv
// Kernel function-pointer typedef: conjx, n, alpha, x with its increment,
// beta, y with its increment (all pointers restrict-qualified), and a
// cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( axpbyv )
// axpyv, scal2v
// Kernel function-pointer typedef: conjx, n, alpha, x and y (each with an
// inc_t increment; all pointers restrict-qualified), and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )
// dotv
// Kernel function-pointer typedef: conjx, conjy, n, x and y (each with an
// inc_t increment), rho (all pointers restrict-qualified), and a cntx_t*
// context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( dotv )
// dotxv
// Kernel function-pointer typedef: conjx, conjy, n, alpha, x and y (each
// with an inc_t increment), beta, rho (all pointers restrict-qualified), and
// a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( dotxv )
// invertv
// Kernel function-pointer typedef: n, restrict-qualified x with its
// increment, and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( invertv )
// scalv, setv
// Kernel function-pointer typedef: conjalpha, n, alpha, x with its increment
// (pointers restrict-qualified), and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )
// swapv
// Kernel function-pointer typedef: n, restrict-qualified x and y (each with
// an inc_t increment), and a cntx_t* context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( swapv )
// xpbyv
// Kernel function-pointer typedef: conjx, n, x with its increment, beta,
// y with its increment (all pointers restrict-qualified), and a cntx_t*
// context.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx \
     );

INSERT_GENTDEF( xpbyv )
#endif
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_ker.h 0000664 0000000 0000000 00000006314 14634250137 0021427 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1v kernels.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l1v_ker_prot.h.
// Each three-line group below redefines GENTPROT as an alias of one
// <OP>_KER_PROT prototype macro and then passes the matching <op>_ker_name
// to INSERT_GENTPROT_BASIC0. GENTPROT is #undef'd before every redefinition,
// so each group is independent of the one before it.
// addv
#undef GENTPROT
#define GENTPROT ADDV_KER_PROT
INSERT_GENTPROT_BASIC0( addv_ker_name )
// amaxv
#undef GENTPROT
#define GENTPROT AMAXV_KER_PROT
INSERT_GENTPROT_BASIC0( amaxv_ker_name )
// axpbyv
#undef GENTPROT
#define GENTPROT AXPBYV_KER_PROT
INSERT_GENTPROT_BASIC0( axpbyv_ker_name )
// axpyv
#undef GENTPROT
#define GENTPROT AXPYV_KER_PROT
INSERT_GENTPROT_BASIC0( axpyv_ker_name )
// copyv
#undef GENTPROT
#define GENTPROT COPYV_KER_PROT
INSERT_GENTPROT_BASIC0( copyv_ker_name )
// dotv
#undef GENTPROT
#define GENTPROT DOTV_KER_PROT
INSERT_GENTPROT_BASIC0( dotv_ker_name )
// dotxv
#undef GENTPROT
#define GENTPROT DOTXV_KER_PROT
INSERT_GENTPROT_BASIC0( dotxv_ker_name )
// invertv
#undef GENTPROT
#define GENTPROT INVERTV_KER_PROT
INSERT_GENTPROT_BASIC0( invertv_ker_name )
// scalv
#undef GENTPROT
#define GENTPROT SCALV_KER_PROT
INSERT_GENTPROT_BASIC0( scalv_ker_name )
// scal2v
#undef GENTPROT
#define GENTPROT SCAL2V_KER_PROT
INSERT_GENTPROT_BASIC0( scal2v_ker_name )
// setv
#undef GENTPROT
#define GENTPROT SETV_KER_PROT
INSERT_GENTPROT_BASIC0( setv_ker_name )
// subv
#undef GENTPROT
#define GENTPROT SUBV_KER_PROT
INSERT_GENTPROT_BASIC0( subv_ker_name )
// swapv
#undef GENTPROT
#define GENTPROT SWAPV_KER_PROT
INSERT_GENTPROT_BASIC0( swapv_ker_name )
// xpbyv
#undef GENTPROT
#define GENTPROT XPBYV_KER_PROT
INSERT_GENTPROT_BASIC0( xpbyv_ker_name )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_ker_prot.h 0000664 0000000 0000000 00000013173 14634250137 0022474 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1v kernels.
//
// Prototype template for addv kernels: conjx, n, x and y (each with an
// inc_t increment), and a context pointer.
#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for amaxv kernels: n, x with its increment, a dim_t*
// index, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx \
     );
// Prototype template for axpbyv kernels: conjx, n, alpha, x with its
// increment, beta, y with its increment, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for axpyv kernels: conjx, n, alpha, x and y (each with
// an inc_t increment), and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for copyv kernels: conjx, n, x and y (each with an
// inc_t increment), and a context pointer.
#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for dotv kernels: conjx, conjy, n, x and y (each with
// an inc_t increment), rho, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );
// Prototype template for dotxv kernels: conjx, conjy, n, alpha, x and y
// (each with an inc_t increment), beta, rho, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );
// Prototype template for invertv kernels: n, x with its increment, and a
// context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );
// Prototype template for scalv kernels: conjalpha, n, alpha, x with its
// increment, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );
// Prototype template for scal2v kernels: conjx, n, alpha, x and y (each with
// an inc_t increment), and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for setv kernels: conjalpha, n, alpha, x with its
// increment, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );
// Prototype template for subv kernels: conjx, n, x and y (each with an
// inc_t increment), and a context pointer.
#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for swapv kernels: n, x and y (each with an inc_t
// increment), and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
// Prototype template for xpbyv kernels: conjx, n, x with its increment,
// beta, y with its increment, and a context pointer.
// The last line carries no trailing backslash, so the macro definition ends
// at the closing ");" and cannot absorb whatever line follows it.
#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_oapi.c 0000664 0000000 0000000 00000034011 14634250137 0021564 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
//
// Define object-based interfaces.
//
// Object-API front-end for operations of the form op( x, y ): unpack both
// vector objects and hand the raw fields to the type-specific function
// returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  len   = bli_obj_vector_dim( x ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, y ); \
    } \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, len, x_buf, x_inc, y_buf, y_inc, cntx, rntm ); \
}

GENFRONT( addv )
GENFRONT( copyv )
GENFRONT( subv )
// Object-API front-end for op( x, index ): unpack the vector x and the
// buffer of the index object, then call the type-specific function
// returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t x_dt      = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    dim_t len       = bli_obj_vector_dim( x ); \
    void* x_buf     = bli_obj_buffer_at_off( x ); \
    inc_t x_inc     = bli_obj_vector_inc( x ); \
\
    /* Buffer of the index object. */ \
    void* index_buf = bli_obj_buffer_at_off( index ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, index ); \
    } \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( len, x_buf, x_inc, index_buf, cntx, rntm ); \
}

GENFRONT( amaxv )
// Object-API front-end for op( alpha, x, beta, y ): unpack the vectors,
// make local copies of both scalars in x's datatype, and call the
// type-specific function returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  len   = bli_obj_vector_dim( x ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc = bli_obj_vector_inc( y ); \
\
    /* Storage for the detached scalar copies. */ \
    obj_t  alpha_local; \
    obj_t  beta_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, beta, y ); \
    } \
\
    /* Copy-cast each scalar to x_dt and take the address of its value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_local ); \
\
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_local ); \
    void*  beta_buf  = bli_obj_buffer_for_1x1( x_dt, &beta_local ); \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, len, \
              alpha_buf, \
              x_buf, x_inc, \
              beta_buf, \
              y_buf, y_inc, \
              cntx, rntm ); \
}

GENFRONT( axpbyv )
// Object-API front-end for op( alpha, x, y ): unpack the vectors, make a
// local copy of alpha in x's datatype, and call the type-specific
// function returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  len   = bli_obj_vector_dim( x ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc = bli_obj_vector_inc( y ); \
\
    /* Storage for the detached scalar copy. */ \
    obj_t  alpha_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, y ); \
    } \
\
    /* Copy-cast alpha to x_dt and take the address of its value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_local ); \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, len, \
              alpha_buf, \
              x_buf, x_inc, \
              y_buf, y_inc, \
              cntx, rntm ); \
}

GENFRONT( axpyv )
GENFRONT( scal2v )
// Object-API front-end for op( x, y, rho ): unpack both vectors and the
// buffer of rho, then call the type-specific function returned by the
// operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt    = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx   = bli_obj_conj_status( x ); \
    dim_t  len     = bli_obj_vector_dim( x ); \
    void*  x_buf   = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc   = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    conj_t conjy   = bli_obj_conj_status( y ); \
    void*  y_buf   = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc   = bli_obj_vector_inc( y ); \
\
    /* Buffer of the rho object. */ \
    void*  rho_buf = bli_obj_buffer_at_off( rho ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, y, rho ); \
    } \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, conjy, len, \
              x_buf, x_inc, \
              y_buf, y_inc, \
              rho_buf, \
              cntx, rntm ); \
}

GENFRONT( dotv )
// Object-API front-end for op( alpha, x, y, beta, rho ): unpack the
// vectors and rho's buffer, make local copies of both scalars in x's
// datatype, and call the type-specific function returned by the
// operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt    = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx   = bli_obj_conj_status( x ); \
    dim_t  len     = bli_obj_vector_dim( x ); \
    void*  x_buf   = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc   = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    conj_t conjy   = bli_obj_conj_status( y ); \
    void*  y_buf   = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc   = bli_obj_vector_inc( y ); \
\
    /* Buffer of the rho object. */ \
    void*  rho_buf = bli_obj_buffer_at_off( rho ); \
\
    /* Storage for the detached scalar copies. */ \
    obj_t  alpha_local; \
    obj_t  beta_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \
    } \
\
    /* Copy-cast each scalar to x_dt and take the address of its value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_local ); \
\
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_local ); \
    void*  beta_buf  = bli_obj_buffer_for_1x1( x_dt, &beta_local ); \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, conjy, len, \
              alpha_buf, \
              x_buf, x_inc, \
              y_buf, y_inc, \
              beta_buf, \
              rho_buf, \
              cntx, rntm ); \
}

GENFRONT( dotxv )
// Object-API front-end for op( x ): unpack the single vector object and
// call the type-specific function returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    dim_t len   = bli_obj_vector_dim( x ); \
    void* x_buf = bli_obj_buffer_at_off( x ); \
    inc_t x_inc = bli_obj_vector_inc( x ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x ); \
    } \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( len, x_buf, x_inc, cntx, rntm ); \
}

GENFRONT( invertv )
// Object-API front-end for op( alpha, x ): unpack x, make a local copy of
// alpha in x's datatype, and call the type-specific function returned by
// the operation's _qfp query. alpha's own conjugation status is not read
// here; BLIS_NO_CONJUGATE is always passed to the typed function.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    dim_t len   = bli_obj_vector_dim( x ); \
    void* x_buf = bli_obj_buffer_at_off( x ); \
    inc_t x_inc = bli_obj_vector_inc( x ); \
\
    /* Storage for the detached scalar copy. */ \
    obj_t alpha_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x ); \
    } \
\
    /* Copy-cast alpha to x_dt and take the address of its value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_local ); \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \
              len, \
              alpha_buf, \
              x_buf, x_inc, \
              cntx, rntm ); \
}

GENFRONT( scalv )
GENFRONT( setv )
// Object-API front-end for op( x, y ) without a conjugation argument:
// unpack both vector objects and call the type-specific function returned
// by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    dim_t len   = bli_obj_vector_dim( x ); \
    void* x_buf = bli_obj_buffer_at_off( x ); \
    inc_t x_inc = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    void* y_buf = bli_obj_buffer_at_off( y ); \
    inc_t y_inc = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, y ); \
    } \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( len, x_buf, x_inc, y_buf, y_inc, cntx, rntm ); \
}

GENFRONT( swapv )
// Object-API front-end for op( x, beta, y ): unpack the vectors, make a
// local copy of beta in x's datatype, and call the type-specific function
// returned by the operation's _qfp query.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the function pointer queried below. */ \
    num_t  x_dt  = bli_obj_dt( x ); \
\
    /* Raw fields of x. */ \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  len   = bli_obj_vector_dim( x ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc = bli_obj_vector_inc( x ); \
\
    /* Raw fields of y. */ \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc = bli_obj_vector_inc( y ); \
\
    /* Storage for the detached scalar copy. */ \
    obj_t  beta_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, beta, y ); \
    } \
\
    /* Copy-cast beta to x_dt and take the address of its value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_local ); \
    void*  beta_buf = bli_obj_buffer_for_1x1( x_dt, &beta_local ); \
\
    /* Query the void*-argument function pointer for x_dt, then call it. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp( conjx, len, \
              x_buf, x_inc, \
              beta_buf, \
              y_buf, y_inc, \
              cntx, rntm ); \
}

GENFRONT( xpbyv )
#endif
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_oapi.h 0000664 0000000 0000000 00000007652 14634250137 0021604 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Prototype template for object-API operations taking two vector objects
// (x, y), followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for addv, copyv, and subv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )
// Prototype template for object-API operations taking (x, index),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for amaxv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* index \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( amaxv )
// Prototype template for object-API operations taking (alpha, x, beta, y),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for axpbyv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* beta, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( axpbyv )
// Prototype template for object-API operations taking (alpha, x, y),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for axpyv and scal2v.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( axpyv )
GENTPROT( scal2v )
// Prototype template for object-API operations taking (x, y, rho),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for dotv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y, \
obj_t* rho \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( dotv )
// Prototype template for object-API operations taking
// (alpha, x, y, beta, rho), followed by whatever BLIS_OAPI_EX_PARAMS
// expands to. Instantiated below for dotxv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y, \
obj_t* beta, \
obj_t* rho \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( dotxv )
// Prototype template for object-API operations taking a single vector
// object x, followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for invertv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( invertv )
// Prototype template for object-API operations taking (alpha, x),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for scalv and setv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( scalv )
GENTPROT( setv )
// Prototype template for object-API operations taking two vector objects
// (x, y), followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for swapv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( swapv )
// Prototype template for object-API operations taking (x, beta, y),
// followed by whatever BLIS_OAPI_EX_PARAMS expands to.
// Instantiated below for xpbyv.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( xpbyv )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_oapi_ba.c 0000664 0000000 0000000 00000003670 14634250137 0022235 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_oapi_ba.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1v_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_oapi_ex.c 0000664 0000000 0000000 00000003666 14634250137 0022274 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1v_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_tapi.c 0000664 0000000 0000000 00000021466 14634250137 0021603 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
//
// Define BLAS-like interfaces with typed operands.
//
// Typed-API template for op( conjx, n, x, y ): resolve the context, fetch
// the level-1v kernel stored under kerid for this datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, n, x, incx, y, incy, cntx ); \
}

INSERT_GENTFUNC_BASIC( addv, BLIS_ADDV_KER )
INSERT_GENTFUNC_BASIC( copyv, BLIS_COPYV_KER )
INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER )
// Typed-API template for op( n, x, index ): resolve the context, fetch the
// level-1v kernel stored under kerid for this datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       dim_t*  index \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( n, x, incx, index, cntx ); \
}

INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER )
// Typed-API template for op( conjx, n, alpha, x, beta, y ): resolve the
// context, fetch the level-1v kernel stored under kerid for this
// datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, n, alpha, x, incx, beta, y, incy, cntx ); \
}

INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER )
// Typed-API template for op( conjx, n, alpha, x, y ): resolve the context,
// fetch the level-1v kernel stored under kerid for this datatype, and
// call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, n, alpha, x, incx, y, incy, cntx ); \
}

INSERT_GENTFUNC_BASIC( axpyv, BLIS_AXPYV_KER )
INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER )
// Typed-API template for op( conjx, conjy, n, x, y, rho ): resolve the
// context, fetch the level-1v kernel stored under kerid for this
// datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, conjy, n, x, incx, y, incy, rho, cntx ); \
}

INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER )
// Typed-API template for op( conjx, conjy, n, alpha, x, y, beta, rho ):
// resolve the context, fetch the level-1v kernel stored under kerid for
// this datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  beta, \
       ctype*  rho \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, conjy, n, \
            alpha, \
            x, incx, \
            y, incy, \
            beta, \
            rho, \
            cntx ); \
}

INSERT_GENTFUNC_BASIC( dotxv, BLIS_DOTXV_KER )
// Typed-API template for op( n, x ): resolve the context, fetch the
// level-1v kernel stored under kerid for this datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( n, x, incx, cntx ); \
}

INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER )
// Typed-API template for op( conjalpha, n, alpha, x ): resolve the context,
// fetch the level-1v kernel stored under kerid for this datatype, and
// call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjalpha, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjalpha, n, alpha, x, incx, cntx ); \
}

INSERT_GENTFUNC_BASIC( scalv, BLIS_SCALV_KER )
INSERT_GENTFUNC_BASIC( setv, BLIS_SETV_KER )
// Typed-API template for op( n, x, y ): resolve the context, fetch the
// level-1v kernel stored under kerid for this datatype, and call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( n, x, incx, y, incy, cntx ); \
}

INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER )
// Typed-API template for op( conjx, n, x, beta, y ): resolve the context,
// fetch the level-1v kernel stored under kerid for this datatype, and
// call it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* No context supplied: query the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Datatype constant associated with the ch prefix. */ \
    const num_t ker_dt = PASTEMAC(ch,type); \
\
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( ker_dt, kerid, cntx ); \
\
    ker_fp( conjx, n, x, incx, beta, y, incy, cntx ); \
}

INSERT_GENTFUNC_BASIC( xpbyv, BLIS_XPBYV_KER )
#endif
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_tapi.h 0000664 0000000 0000000 00000011626 14634250137 0021605 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
// Prototype template for typed-API operations taking
// (conjx, n, x/incx, y/incy), followed by whatever BLIS_TAPI_EX_PARAMS
// expands to. Instantiated below for addv, copyv, and subv.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
dim_t n, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( addv )
INSERT_GENTPROT_BASIC0( copyv )
INSERT_GENTPROT_BASIC0( subv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
dim_t n, \
ctype* x, inc_t incx, \
dim_t* index \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( amaxv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( axpbyv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( axpyv )
INSERT_GENTPROT_BASIC0( scal2v )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* rho \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( dotv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* beta, \
ctype* rho \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( dotxv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
dim_t n, \
ctype* x, inc_t incx \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( invertv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjalpha, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( scalv )
INSERT_GENTPROT_BASIC0( setv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
dim_t n, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( swapv )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
dim_t n, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
); \
INSERT_GENTPROT_BASIC0( xpbyv )
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_tapi_ba.c 0000664 0000000 0000000 00000003666 14634250137 0022247 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include the cpp macros that instantiate the API definition templates
// without expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1v_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1/bli_l1v_tapi_ex.c 0000664 0000000 0000000 00000003664 14634250137 0022277 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include the cpp macros that instantiate the API definition templates
// with expert parameters.
#include "bli_tapi_ex.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1v_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1/other/ 0000775 0000000 0000000 00000000000 14634250137 0020202 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1/other/packv/ 0000775 0000000 0000000 00000000000 14634250137 0021306 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv.c 0000664 0000000 0000000 00000003234 14634250137 0023406 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv.h 0000664 0000000 0000000 00000003433 14634250137 0023414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_packv_cntl.h"
#include "bli_packv_check.h"
#include "bli_packv_init.h"
#include "bli_packv_int.h"
#include "bli_packv_unb_var1.h"
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_check.c 0000664 0000000 0000000 00000003771 14634250137 0024551 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Parameter check for packv.
//
//   c    - source object; the only operand that is actually inspected.
//   p    - destination (packed) object; accepted but not referenced.
//   cntx - context; accepted but not referenced.
//
// The status returned by bli_check_floating_object( c ) is forwarded to
// bli_check_error_code().
void bli_packv_check( obj_t* c, obj_t* p, cntx_t* cntx )
{
	// Datatype check on the source object.
	const err_t status = bli_check_floating_object( c );

	bli_check_error_code( status );

	// No dimension checks are made: conformality between c and p cannot
	// be verified because p has not yet been initialized.
}
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_check.h 0000664 0000000 0000000 00000003345 14634250137 0024553 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter check for packv: c is the source object, p the object that
// will receive the packed vector, and cntx the context.
void bli_packv_check( obj_t* c, obj_t* p, cntx_t* cntx );
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_cntl.c 0000664 0000000 0000000 00000005056 14634250137 0024432 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Creates a control-tree node for a packv operation.
//
//   var_func       - function pointer stored in the node itself (passed to
//                    bli_cntl_create_node()).
//   packv_var_func - function pointer stored in the node's packv_params_t.
//   bmid           - blocksize id recorded in the params struct.
//   pack_schema    - pack schema recorded in the params struct.
//   sub_node       - child node passed through to bli_cntl_create_node().
//
// Returns the node produced by bli_cntl_create_node(). The params struct is
// allocated here with bli_malloc_intl() and handed to the new node.
cntl_t* bli_packv_cntl_obj_create
     (
       void_fp var_func,
       void_fp packv_var_func,
       bszid_t bmid,
       pack_t  pack_schema,
       cntl_t* sub_node
     )
{
	cntl_t*         cntl;
	packv_params_t* params;

	// Allocate a packv_params_t struct.
	params = bli_malloc_intl( sizeof( packv_params_t ) );

	// Initialize the packv_params_t struct. The variant function goes into
	// the var_func field: that is the name packv_params_t declares and the
	// name bli_cntl_packv_params_var_func() reads.
	params->size        = sizeof( packv_params_t );
	params->var_func    = packv_var_func;
	params->bmid        = bmid;
	params->pack_schema = pack_schema;

	// It's important that we set the bszid field to BLIS_NO_PART to indicate
	// that no blocksize partitioning is performed. bli_cntl_free() will rely
	// on this information to know how to step through the thrinfo_t tree in
	// sync with the cntl_t tree.
	cntl = bli_cntl_create_node
	(
	  BLIS_NO_PART,
	  var_func,
	  params,
	  sub_node
	);

	return cntl;
}
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_cntl.h 0000664 0000000 0000000 00000004564 14634250137 0024442 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
struct packv_params_s
{
uint64_t size
packv_var_oft* var_func;
bszid_t bmid;
pack_t pack_schema;
};
typedef struct packv_params_s packv_params_t;
// Accessors for the packv_params_t stored in a control-tree node's params
// field. The cast is applied to (cntl)->params *before* the member access;
// a cast binds less tightly than "->", so the extra parentheses are needed.
// The macro argument is parenthesized so that any pointer expression may be
// passed.
#define bli_cntl_packv_params_var_func( cntl ) \
\
	( ( (packv_params_t*)(cntl)->params )->var_func )

#define bli_cntl_packv_params_bmid( cntl ) \
\
	( ( (packv_params_t*)(cntl)->params )->bmid )

#define bli_cntl_packv_params_pack_schema( cntl ) \
\
	( ( (packv_params_t*)(cntl)->params )->pack_schema )
// -----------------------------------------------------------------------------
// Creates a packv control-tree node. var_func is the node's own function
// pointer; packv_var_func, bmid and pack_schema are the packv-specific
// parameters; sub_node is the child node.
cntl_t* bli_packv_cntl_obj_create( void_fp var_func,
                                   void_fp packv_var_func,
                                   bszid_t bmid,
                                   pack_t  pack_schema,
                                   cntl_t* sub_node );
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_init.c 0000664 0000000 0000000 00000014537 14634250137 0024441 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Prepares object p to receive a packed copy of vector a.
//
//   a    - source vector object.
//   p    - object to initialize; it becomes an alias of a when no packing
//          is needed.
//   cntx - context, forwarded to the check and to bli_packv_init_pack().
//   cntl - control-tree node that encodes the desired pack schema and
//          blocksize id.
//
// Aborts via bli_abort() if a's datatype differs from its target datatype.
void bli_packv_init
     (
       obj_t*   a,
       obj_t*   p,
       cntx_t*  cntx,
       packv_t* cntl
     )
{
	// The purpose of packv_init() is to initialize an object P so that
	// a source object A can be packed into P via one of the packv
	// implementations.
	// NOTE(review): the memory-acquisition step in bli_packv_init_pack()
	// is currently compiled out with #if 0, so no buffer is attached to P
	// by this call.

	pack_t  pack_schema;
	bszid_t bmult_id;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_packv_check( a, p, cntx );

	// First check if we are to skip this operation because the control tree
	// is NULL, and if so, simply alias the object to its packed counterpart.
	if ( bli_cntl_is_noop( cntl ) )
	{
		bli_obj_alias_to( a, p );
		return;
	}

	// At this point, we can be assured that cntl is not NULL. Let us now
	// check to see if the object has already been packed to the desired
	// schema (as encoded in the control tree). If so, we can alias and
	// return, as above.
	// Note that in most cases, bli_obj_pack_schema() will return
	// BLIS_NOT_PACKED and thus packing will be called for (but in some
	// cases packing has already taken place). Also, not all combinations
	// of current pack status and desired pack schema are valid.
	if ( bli_obj_pack_schema( a ) == cntl_pack_schema( cntl ) )
	{
		bli_obj_alias_to( a, p );
		return;
	}

	// Now, if we are not skipping the pack operation, then the only question
	// left is whether we are to typecast vector a before packing. Typecasting
	// is not supported here, so a datatype mismatch is fatal.
	if ( bli_obj_dt( a ) != bli_obj_target_dt( a ) )
		bli_abort();

	// Extract various fields from the control tree and pass them in
	// explicitly into _init_pack(). This allows external code generators
	// the option of bypassing usage of control trees altogether.
	pack_schema = cntl_pack_schema( cntl );
	bmult_id    = cntl_bmid( cntl );

	// Initialize object p for the final packed vector. a is already an
	// obj_t*, so it is passed as-is (not by address). The size returned by
	// bli_packv_init_pack() is not needed here and is discarded.
	bli_packv_init_pack
	(
	  pack_schema,
	  bmult_id,
	  a,
	  p,
	  cntx
	);

	// Now p is ready to be packed.
}
// Initializes p as the packed counterpart of vector a and returns the
// number of bytes the packed buffer would occupy.
//
//   schema   - pack schema to record in p.
//   bmult_id - blocksize id whose default value (for a's datatype) is used
//              as the multiple to which the vector length is padded.
//   a        - source vector object.
//   p        - object to initialize.
//   cntx     - context queried for the blocksize.
//
// The buffer acquisition itself is compiled out (#if 0); only the metadata
// of p is set up.
siz_t bli_packv_init_pack( pack_t  schema,
                           bszid_t bmult_id,
                           obj_t*  a,
                           obj_t*  p,
                           cntx_t* cntx )
{
	// Everything that is read from a or cntx is gathered before p is
	// touched.
	num_t  elem_dt  = bli_obj_dt( a );
	dim_t  len_a    = bli_obj_vector_dim( a );
	dim_t  len_mult = bli_cntx_get_blksz_def_dt( elem_dt, bmult_id, cntx );
	pba_t* pba      = bli_cntx_pba( cntx );
#if 0
	mem_t* mem_p;
#endif
	dim_t  len_padded;
	siz_t  bytes_needed;
	void*  buf;

	// Start p off as an alias of a, then reshape it into a len_a x 1
	// object whose view begins at offset (0,0).
	bli_obj_alias_to( a, p );
	bli_obj_set_dims( len_a, 1, p );
	bli_obj_set_offs( 0, 0, p );

	// Record the requested pack schema in p.
	bli_obj_set_pack_schema( schema, p );

	// Round the vector length up to the blocksize multiple and derive the
	// byte size of the (single-column) packed buffer from it.
	len_padded   = bli_align_dim_to_mult( bli_obj_vector_dim( p ), len_mult );
	bytes_needed = len_padded * 1 * bli_obj_elem_size( p );

#if 0
	// Extract the address of the mem_t object within p that will track
	// properties of the packed buffer.
	mem_p = bli_obj_pack_mem( *p );

	if ( bli_mem_is_unalloc( mem_p ) )
	{
		// If the mem_t object of p has not yet been allocated, then acquire
		// a memory block suitable for a vector.
		bli_pba_acquire_v( pba, bytes_needed, mem_p );
	}
	else
	{
		// If the mem_t object has already been allocated, then release and
		// re-acquire the memory so there is sufficient space.
		if ( bli_mem_size( mem_p ) < bytes_needed )
		{
			bli_pba_release( mem_p );
			bli_pba_acquire_v( pba, bytes_needed, mem_p );
		}
	}

	// Grab the buffer address from the mem_t object and copy it to the
	// main object buffer field. (Sometimes this buffer address will be
	// copied when the value is already up-to-date, because it persists
	// in the main object buffer field across loop iterations.)
	buf = bli_mem_buffer( mem_p );
	bli_obj_set_buffer( buf, p );
#endif

	// Store the padded dimensions in p.
	bli_obj_set_padded_dims( len_padded, 1, p );

	// Strides are only assigned for the packed-vector schema.
	if ( schema == BLIS_PACKED_VECTOR )
	{
		// Column storage: unit row stride. The column stride may never be
		// used; it is only useful to determine how much space beyond the
		// vector would need zero-padding, if zero-padding was needed.
		inc_t row_stride = 1;
		inc_t col_stride = bli_obj_padded_length( p );

		bli_obj_set_strides( row_stride, col_stride, p );
	}

	return bytes_needed;
}
#if 0
// Disabled: this function is compiled out by the enclosing #if 0.
// When cntl is not a no-op node, it calls bli_obj_release_pack( p );
// otherwise it does nothing.
void bli_packv_release
(
obj_t* p,
packv_t* cntl
)
{
if ( !bli_cntl_is_noop( cntl ) )
bli_obj_release_pack( p );
}
#endif
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_init.h 0000664 0000000 0000000 00000003632 14634250137 0024440 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Initializes p so that vector a can be packed into it, or aliases p to a
// when no packing is required. cntl is the packv control-tree node.
void bli_packv_init( obj_t* a, obj_t* p, cntx_t* cntx, packv_t* cntl );
// Sets up the metadata of p as the packed counterpart of a, using the
// given pack schema and blocksize id, and returns the packed buffer size
// in bytes.
siz_t bli_packv_init_pack( pack_t  pack_schema,
                           bszid_t bmult_id,
                           obj_t*  a,
                           obj_t*  p,
                           cntx_t* cntx );
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_int.c 0000664 0000000 0000000 00000010434 14634250137 0024260 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packv_fp

// Signature shared by all packv variants reachable through vars[][].
typedef void (*FUNCPTR_T)( obj_t*   a,
                           obj_t*   p,
                           cntx_t*  cntx,
                           packv_t* cntl );

// Variant dispatch table, indexed as vars[variant number][implementation
// type]. Only the unblocked slot of the single row is populated.
static FUNCPTR_T vars[1][3] =
{
	// unblocked            optimized unblocked    blocked
	{ bli_packv_unb_var1,   NULL,                  NULL                }
};

// Internal back-end for packv: decides whether packing is needed and, if
// so, dispatches to the variant selected by the control tree.
//
//   a    - source vector object.
//   p    - destination object, already set up by packv_init().
//   cntx - context, forwarded to the check and to the variant.
//   cntl - control-tree node.
//
// NOTE(review): the prototype in bli_packv_int.h and the FUNCPTR_T typedef
// above both name the last parameter type packv_t*, whereas this definition
// uses cntl_t*. Confirm which is intended before enabling this code.
void bli_packv_int
     (
       obj_t*  a,
       obj_t*  p,
       cntx_t* cntx,
       cntl_t* cntl
     )
{
	// n and i are used unconditionally below, so their declarations must
	// not be hidden behind #if 0.
	varnum_t  n;
	impl_t    i;

	// The table holds FUNCPTR_T entries, so the selected entry is held in a
	// variable of that same type.
	FUNCPTR_T f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_packv_check( a, p, cntx );

	// Sanity check; A should never have a zero dimension. If we must support
	// it, then we should fold it into the next alias-and-early-exit block.
	//if ( bli_obj_has_zero_dim( a ) ) bli_abort();

	// First check if we are to skip this operation because the control tree
	// is NULL. We return without taking any action because a was already
	// aliased to p in packv_init().
	if ( bli_cntl_is_noop( cntl ) )
	{
		return;
	}

	// Let us now check to see if the object has already been packed. First
	// we check if it has been packed to an unspecified (row or column)
	// format, in which case we can return, since by now aliasing has already
	// taken place in packv_init().
	// NOTE: The reason we don't need to even look at the control tree in
	// this case is as follows: an object's pack status is only set to
	// BLIS_PACKED_UNSPEC for situations when the actual format used is
	// not important, as long as it's packed into contiguous rows or
	// contiguous columns. A good example of this is packing for matrix
	// operands in the level-2 operations.
	if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC )
	{
		return;
	}

	// At this point, we can be assured that cntl is not NULL. Now we check
	// if the object has already been packed to the desired schema (as en-
	// coded in the control tree). If so, we can return, as above.
	// NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED
	// and thus packing will be called for (but in some cases packing has
	// already taken place, or does not need to take place, and so that will
	// be indicated by the pack status). Also, not all combinations of
	// current pack status and desired pack schema are valid.
	if ( bli_obj_pack_schema( a ) == cntl_pack_schema( cntl ) )
	{
		return;
	}

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	// NOTE(review): two of the three slots are NULL and no bounds or NULL
	// check is made here; callers must only request the populated variant.
	f = vars[n][i];

	// Invoke the variant.
	f( a,
	   p,
	   cntx,
	   cntl );
}
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_int.h 0000664 0000000 0000000 00000003375 14634250137 0024273 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal packv back-end: c is the source object, p the packed
// destination, cntx the context and cntl the control-tree node.
// NOTE(review): the definition in bli_packv_int.c declares the last
// parameter as cntl_t* rather than packv_t* -- confirm which is intended.
void bli_packv_int( obj_t* c, obj_t* p, cntx_t* cntx, packv_t* cntl );
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_unb_var1.c 0000664 0000000 0000000 00000006117 14634250137 0025206 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#define FUNCPTR_T packv_fp

// Signature of the datatype-specific functions stored in ftypes[].
typedef void (*FUNCPTR_T)( dim_t   m,
                           void*   c, inc_t incc,
                           void*   p, inc_t incp,
                           cntx_t* cntx );

// Table of datatype-specific packv_unb_var1 functions, indexed by num_t.
static FUNCPTR_T GENARRAY(ftypes,packv_unb_var1);

// Object-level front end of unblocked packv variant 1.
//
//   c    - source vector object.
//   p    - destination vector object.
//   cntx - context, forwarded to the typed function.
//   cntl - control-tree node; not referenced in this function.
//
// The datatype is taken from c and the length from p.
void bli_packv_unb_var1( obj_t* c, obj_t* p, cntx_t* cntx, packv_t* cntl )
{
	// Pull the raw arguments out of the two objects.
	const num_t dt      = bli_obj_dt( c );
	const dim_t len     = bli_obj_vector_dim( p );

	void*       src     = bli_obj_buffer_at_off( c );
	const inc_t src_inc = bli_obj_vector_inc( c );

	void*       dst     = bli_obj_buffer_at_off( p );
	const inc_t dst_inc = bli_obj_vector_inc( p );

	// Dispatch straight through the datatype-indexed table.
	ftypes[dt]( len,
	            src, src_inc,
	            dst, dst_inc,
	            cntx );
}
// Template for the type-specific packv_unb_var1 routines. For a given type
// character ch, the generated function:
//   1. obtains the datatype constant via PASTEMAC(ch,type),
//   2. queries the context for the level-1v kernel registered under
//      BLIS_COPYV_KER for that datatype, and
//   3. calls that kernel with BLIS_NO_CONJUGATE, the length m, and the
//      (buffer, increment) pairs of c and p, in that order.
// NOTE(review): c is presumably the source and p the destination of the
// copy -- confirm against the copyv kernel signature.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
dim_t m, \
void* c, inc_t incc, \
void* p, inc_t incp, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
\
copyv_p \
( \
BLIS_NO_CONJUGATE, \
m, \
c, incc, \
p, incp, \
cntx \
); \
}
// Instantiate the template above for packv_unb_var1.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably expands GENTFUNC once per
// basic datatype -- confirm against its definition.
INSERT_GENTFUNC_BASIC0( packv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/1/other/packv/bli_packv_unb_var1.h 0000664 0000000 0000000 00000004044 14634250137 0025210 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based unblocked packv variant. Takes two vector objects (c and p),
// a context and a packv control tree node.
void bli_packv_unb_var1( obj_t* c,
obj_t* p,
cntx_t* cntx,
packv_t* cntl );
// Prototype template for the type-specific packv_unb_var1 routines: each
// takes a length m, the (buffer, increment) pairs for c and p, and a context.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
dim_t m, \
void* c, inc_t incc, \
void* p, inc_t incp, \
cntx_t* cntx \
);
// Emit the prototypes for packv_unb_var1.
// NOTE(review): INSERT_GENTPROT_BASIC presumably expands GENTPROT once per
// basic datatype -- confirm against its definition.
INSERT_GENTPROT_BASIC( packv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/1/other/scalv/ 0000775 0000000 0000000 00000000000 14634250137 0021312 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1/other/scalv/bli_scalv_cntl.c 0000664 0000000 0000000 00000004565 14634250137 0024446 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Global scalv control tree node. It is NULL until bli_scalv_cntl_init()
// assigns it.
scalv_t* scalv_cntl = NULL;

// Creates the global scalv control tree node, configured with the
// BLIS_UNBLOCKED implementation type and BLIS_VARIANT1.
//
// The parameter list is spelled ( void ) so that this definition is a true
// prototype and matches the declaration in bli_scalv_cntl.h.
void bli_scalv_cntl_init( void )
{
	scalv_cntl = bli_scalv_cntl_obj_create( BLIS_UNBLOCKED,
	                                        BLIS_VARIANT1 );
}
void bli_scalv_cntl_finalize()
{
bli_cntl_free_node( scalv_cntl );
}
// Allocates a scalv control tree node with bli_malloc_intl() and fills in
// its two fields.
//
//   impl_type - implementation type to store in the node.
//   var_num   - variant number to store in the node.
//
// Returns the address of the newly allocated node.
scalv_t* bli_scalv_cntl_obj_create( impl_t   impl_type,
                                    varnum_t var_num )
{
	scalv_t* const node = ( scalv_t* ) bli_malloc_intl( sizeof( scalv_t ) );

	node->impl_type = impl_type;
	node->var_num   = var_num;

	return node;
}
// Fills in the fields of an existing scalv control tree node. No memory is
// allocated here; cntl must already point to a node.
//
//   cntl      - node to initialize.
//   impl_type - implementation type to store in the node.
//   var_num   - variant number to store in the node.
void bli_scalv_cntl_obj_init( scalv_t* cntl,
                              impl_t   impl_type,
                              varnum_t var_num )
{
	// The two stores are independent of one another.
	cntl->var_num   = var_num;
	cntl->impl_type = impl_type;
}
cython-blis-1.0.0/blis/_src/frame/1/other/scalv/bli_scalv_cntl.h 0000664 0000000 0000000 00000004213 14634250137 0024441 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control tree node for the scalv operation.
struct scalv_s
{
// Implementation type; bli_scalv_cntl_init() sets it to BLIS_UNBLOCKED.
impl_t impl_type;
// Variant number; bli_scalv_cntl_init() sets it to BLIS_VARIANT1.
varnum_t var_num;
};
typedef struct scalv_s scalv_t;
// Accesses the sub_scalv member of the control tree node that cntl points to.
// The macro argument and the whole expansion are parenthesized so that
// arguments such as ( node + 1 ) expand correctly; the result is still an
// lvalue and may be assigned to.
// NOTE(review): the node type that carries sub_scalv is not declared in
// this header.
#define bli_cntl_sub_scalv( cntl ) ( (cntl)->sub_scalv )
// Create / release the global scalv control tree node.
void bli_scalv_cntl_init( void );
void bli_scalv_cntl_finalize( void );
// Allocate a new scalv control tree node holding the given implementation
// type and variant number, and return it.
scalv_t* bli_scalv_cntl_obj_create( impl_t impl_type,
varnum_t var_num );
// Store the given implementation type and variant number in an existing
// scalv control tree node.
void bli_scalv_cntl_obj_init( scalv_t* cntl,
impl_t impl_type,
varnum_t var_num );
cython-blis-1.0.0/blis/_src/frame/1/other/scalv/bli_scalv_int.c 0000664 0000000 0000000 00000005374 14634250137 0024277 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the routines that bli_scalv_int() dispatches to.
typedef void (*FUNCPTR_T)( obj_t* alpha,
obj_t* x,
cntx_t* cntx );
// Dispatch table, indexed as vars[variant number][implementation type].
// There is a single variant row; the "blocked" column has no implementation
// and holds NULL.
static FUNCPTR_T vars[1][3] =
{
// unblocked optimized unblocked blocked
{ bli_scalv_ex, bli_scalv_ex, NULL }
};
// Control-tree-driven entry point for scalv. After the early exits below,
// it selects a routine from the vars table using the variant number and
// implementation type stored in cntl, and calls it with alpha, x and cntx.
//
//   alpha - scalar object.
//   x     - vector object.
//   cntx  - context, forwarded to the selected routine.
//   cntl  - scalv control tree node used to select the routine.
void bli_scalv_int( obj_t*   alpha,
                    obj_t*   x,
                    cntx_t*  cntx,
                    scalv_t* cntl )
{
	// Nothing to do when the vector operand has a zero dimension.
	if ( bli_obj_has_zero_dim( x ) )
	{
		return;
	}

	// Validate the operands when error checking is enabled.
	if ( bli_error_checking_is_enabled() )
	{
		bli_scalv_check( alpha, x );
	}

	// Skip the operation if the control tree says so.
	if ( bli_cntl_is_noop( cntl ) )
	{
		return;
	}

	// Likewise skip it when alpha equals one.
	if ( bli_obj_equals( alpha, &BLIS_ONE ) )
	{
		return;
	}

	// Read the variant number and implementation type from the control
	// tree node and use them as the row and column of the dispatch table.
	const varnum_t  variant  = bli_cntl_var_num( cntl );
	const impl_t    impl     = bli_cntl_impl_type( cntl );
	const FUNCPTR_T scalv_fp = vars[variant][impl];

	// Run the selected routine.
	scalv_fp( alpha, x, cntx );
}
cython-blis-1.0.0/blis/_src/frame/1/other/scalv/bli_scalv_int.h 0000664 0000000 0000000 00000003426 14634250137 0024300 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control-tree-driven entry point for scalv. Takes the scalar object alpha,
// the vector object x, a context and a scalv control tree node.
void bli_scalv_int( obj_t* alpha,
obj_t* x,
cntx_t* cntx,
scalv_t* cntl );
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/ 0000775 0000000 0000000 00000000000 14634250137 0021651 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv.c 0000664 0000000 0000000 00000003234 14634250137 0024314 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv.h 0000664 0000000 0000000 00000003410 14634250137 0024315 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_unpackv_cntl.h"
#include "bli_unpackv_check.h"
#include "bli_unpackv_int.h"
#include "bli_unpackv_unb_var1.h"
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_check.c 0000664 0000000 0000000 00000004227 14634250137 0025454 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Runs the parameter checks for unpackv on the objects p and a. Each check
// produces an err_t code that is handed to bli_check_error_code().
//
//   p    - first vector object; also the one whose pack schema is checked.
//   a    - second vector object.
//   cntx - context; not referenced by this function.
void bli_unpackv_check
     (
       obj_t*  p,
       obj_t*  a,
       cntx_t* cntx
     )
{
	// Datatype checks on p and on a.
	const err_t p_dt_status = bli_check_floating_object( p );
	bli_check_error_code( p_dt_status );

	const err_t a_dt_status = bli_check_floating_object( a );
	bli_check_error_code( a_dt_status );

	// Dimension check: compares the vector lengths of p and a.
	const err_t length_status = bli_check_equal_vector_lengths( p, a );
	bli_check_error_code( length_status );

	// Pack status check on p.
	const err_t schema_status = bli_check_packv_schema_on_unpack( p );
	bli_check_error_code( schema_status );
}
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_check.h 0000664 0000000 0000000 00000003347 14634250137 0025463 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter checks for unpackv on the vector objects p and a.
void bli_unpackv_check
(
obj_t* p,
obj_t* a,
cntx_t* cntx
);
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_cntl.c 0000664 0000000 0000000 00000004632 14634250137 0025337 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Global unpackv control tree node. It is NULL until
// bli_unpackv_cntl_init() assigns it.
unpackv_t* unpackv_cntl = NULL;

// Creates the global unpackv control tree node, configured with the
// BLIS_UNBLOCKED implementation type and BLIS_VARIANT1.
//
// The parameter list is spelled ( void ) so that this definition is a true
// prototype and matches the declaration in bli_unpackv_cntl.h.
void bli_unpackv_cntl_init( void )
{
	unpackv_cntl = bli_unpackv_cntl_obj_create( BLIS_UNBLOCKED,
	                                            BLIS_VARIANT1 );
}
void bli_unpackv_cntl_finalize()
{
bli_cntl_free_node( unpackv_cntl );
}
// Allocates an unpackv control tree node with bli_malloc_intl() and fills
// in its two fields.
//
//   impl_type - implementation type to store in the node.
//   var_num   - variant number to store in the node.
//
// Returns the address of the newly allocated node.
unpackv_t* bli_unpackv_cntl_obj_create( impl_t   impl_type,
                                        varnum_t var_num )
{
	unpackv_t* const node = ( unpackv_t* ) bli_malloc_intl( sizeof( unpackv_t ) );

	node->impl_type = impl_type;
	node->var_num   = var_num;

	return node;
}
// Fills in the fields of an existing unpackv control tree node. No memory
// is allocated here; cntl must already point to a node.
//
//   cntl      - node to initialize.
//   impl_type - implementation type to store in the node.
//   var_num   - variant number to store in the node.
void bli_unpackv_cntl_obj_init( unpackv_t* cntl,
                                impl_t     impl_type,
                                varnum_t   var_num )
{
	// The two stores are independent of one another.
	cntl->var_num   = var_num;
	cntl->impl_type = impl_type;
}
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_cntl.h 0000664 0000000 0000000 00000004637 14634250137 0025351 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control tree node for the unpackv operation.
struct unpackv_s
{
// Implementation type; bli_unpackv_cntl_init() sets it to BLIS_UNBLOCKED.
impl_t impl_type;
// Variant number; bli_unpackv_cntl_init() sets it to BLIS_VARIANT1.
varnum_t var_num;
};
typedef struct unpackv_s unpackv_t;
// Accessors for the unpackv-related members of the control tree node that
// cntl points to. The macro argument and the whole expansion are
// parenthesized so that arguments such as ( node + 1 ) expand correctly;
// each result is still an lvalue and may be assigned to.
// NOTE(review): the node type that carries these members is not declared
// in this header.
#define bli_cntl_sub_unpackv( cntl )    ( (cntl)->sub_unpackv )
#define bli_cntl_sub_unpackv_x( cntl )  ( (cntl)->sub_unpackv_x )
#define bli_cntl_sub_unpackv_x1( cntl ) ( (cntl)->sub_unpackv_x1 )
#define bli_cntl_sub_unpackv_y( cntl )  ( (cntl)->sub_unpackv_y )
#define bli_cntl_sub_unpackv_y1( cntl ) ( (cntl)->sub_unpackv_y1 )
// Create / release the global unpackv control tree node.
void bli_unpackv_cntl_init( void );
void bli_unpackv_cntl_finalize( void );
// Allocate a new unpackv control tree node holding the given implementation
// type and variant number, and return it.
unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type,
varnum_t var_num );
// Store the given implementation type and variant number in an existing
// unpackv control tree node.
void bli_unpackv_cntl_obj_init( unpackv_t* cntl,
impl_t impl_type,
varnum_t var_num );
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_int.c 0000664 0000000 0000000 00000017430 14634250137 0025171 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the variants that bli_unpackv_int() dispatches to.
// FUNCPTR_T is aliased to the name unpackv_fp for the rest of this file.
#define FUNCPTR_T unpackv_fp
typedef void (*FUNCPTR_T)( obj_t* p,
obj_t* a,
cntx_t* cntx,
unpackv_t* cntl );
// Dispatch table, indexed as vars[variant number][implementation type].
// Only the unblocked slot is populated; the other two slots hold NULL.
static FUNCPTR_T vars[1][3] =
{
// unblocked optimized unblocked blocked
{ bli_unpackv_unb_var1, NULL, NULL }
};
// Control-tree-driven entry point for unpackv. After parameter checking and
// the early exits below, it selects a variant from the vars table using the
// variant number and implementation type stored in cntl, and calls it to
// unpack p directly into a (through the alias c).
//
//   p    - packed vector object.
//   a    - vector object to unpack into.
//   cntx - context, forwarded to the selected variant.
//   cntl - unpackv control tree node used to select the variant.
//
// The function aborts (via bli_abort()) if a has a zero dimension, if the
// datatypes of p and a differ, or if the control tree selects a table slot
// that has no implementation.
void bli_unpackv_int( obj_t*     p,
                      obj_t*     a,
                      cntx_t*    cntx,
                      unpackv_t* cntl )
{
	// The unpackv operation consists of an optional casting post-process.
	// (This post-process is analogous to the cast pre-process in packv.)
	// Here are the following possible ways unpackv can execute:
	//  1. unpack and cast: Unpack to a temporary vector c and then cast
	//     c to a.
	//  2. unpack only: Unpack directly to vector a since typecasting is
	//     not needed.
	//  3. cast only: Not yet supported / not used.
	//  4. no-op: The control tree directs us to skip the unpack operation
	//     entirely. No action is taken.

	obj_t     c;

	varnum_t  n;
	impl_t    i;
	FUNCPTR_T f;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_unpackv_check( p, a, cntx );

	// Sanity check; A should never have a zero dimension. If we must support
	// it, then we should fold it into the next alias-and-early-exit block.
	if ( bli_obj_has_zero_dim( a ) ) bli_abort();

	// First check if we are to skip this operation because the control tree
	// is NULL, and if so, simply return.
	if ( bli_cntl_is_noop( cntl ) )
	{
		return;
	}

	// If p was aliased to a during the pack stage (because it was already
	// in an acceptable packed/contiguous format), then no unpack is actually
	// necessary, so we return.
	if ( bli_obj_is_alias_of( p, a ) )
	{
		return;
	}

	// Now, if we are not skipping the unpack operation, then the only
	// question left is whether we are to typecast vector a after unpacking.
	// Typecasting is currently disabled, so mismatched datatypes are fatal.
	if ( bli_obj_dt( p ) != bli_obj_dt( a ) )
		bli_abort();
/*
	if ( bli_obj_dt( p ) != bli_obj_dt( a ) )
	{
		// Initialize an object c for the intermediate typecast vector.
		bli_unpackv_init_cast( p,
		                       a,
		                       &c );
	}
	else
*/
	{
		// If no cast is needed, then aliasing object c to the original
		// vector serves as a minor optimization. This causes the unpackv
		// implementation to unpack directly into vector a.
		bli_obj_alias_to( a, &c );
	}

	// Now we are ready to proceed with the unpacking.

	// Extract the variant number and implementation type.
	n = bli_cntl_var_num( cntl );
	i = bli_cntl_impl_type( cntl );

	// Index into the variant array to extract the correct function pointer.
	f = vars[n][i];

	// Only the unblocked slot of the table is populated; the other slots
	// are NULL. Abort instead of calling through a NULL function pointer if
	// the control tree selects an implementation that does not exist.
	if ( f == NULL ) bli_abort();

	// Invoke the variant.
	f( p,
	   &c,
	   cntx,
	   cntl );

	// Now, if necessary, we cast the contents of c to vector a. If casting
	// was not necessary, then we are done because the call to the unpackv
	// implementation would have unpacked directly to vector a.
/*
	if ( bli_obj_dt( p ) != bli_obj_dt( a ) )
	{
		// Copy/typecast vector c to vector a.
		// NOTE: Here, we use copynzv instead of copym because, in the cases
		// where we are unpacking/typecasting a real vector c to a complex
		// vector a, we want to touch only the real components of a, rather
		// than also set the imaginary components to zero. This comes about
		// because of the fact that, if we are unpacking real-to-complex,
		// then it is because all of the computation occurred in the real
		// domain, and so we would want to leave whatever imaginary values
		// there are in vector a untouched. Notice that for unpackings that
		// entail complex-to-complex data movements, the copynzv operation
		// behaves exactly as copym, so no use cases are lost (at least none
		// that I can think of).
		bli_copynzv( &c,
		             a );

		// NOTE: The above code/comment is outdated. What should happen is
		// as follows:
		// - If dt(a) is complex and dt(p) is real, then create an alias of
		//   a and then tweak it so that it looks like a real domain object.
		//   This will involve:
		//   - projecting the datatype to real domain
		//   - scaling both the row and column strides by 2
		//   ALL OF THIS should be done in the front-end, NOT here, as
		//   unpackv() won't even be needed in that case.
	}
*/
}
/*
void bli_unpackv_init_cast( obj_t* p,
obj_t* a,
obj_t* c )
{
// The idea here is that we want to create an object c that is identical
// to object a, except that:
// (1) the storage datatype of c is equal to the target datatype of a,
// with the element size of c adjusted accordingly,
// (2) object c is marked as being stored in a standard, contiguous
// format (ie: a column vector),
// (3) the view offset of c is reset to (0,0), and
// (4) object c's main buffer is set to a new memory region acquired
// from the memory manager, or extracted from p if a mem entry is
// already available. (After acquiring a mem entry from the memory
// manager, it is cached within p for quick access later on.)
num_t dt_targ_a = bli_obj_target_dt( a );
dim_t dim_a = bli_obj_vector_dim( a );
siz_t elem_size_c = bli_dt_size( dt_targ_a );
// We begin by copying the basic fields of a.
bli_obj_alias_to( a, c );
// Update datatype and element size fields.
bli_obj_set_dt( dt_targ_a, c );
bli_obj_set_elem_size( elem_size_c, c );
// Update the strides and dimensions. We set the increments to reflect a
// column-stored vector. Note that the column stride is set to dim(a),
// though it should never be used because there is no second column to
// index into (and therefore it also does not need to be aligned).
bli_obj_set_dims( dim_a, 1, c );
bli_obj_set_strides( 1, dim_a, c );
// Reset the view offsets to (0,0).
bli_obj_set_offs( 0, 0, c );
// Check the mem_t entry of p associated with the cast buffer. If it is
// NULL, then acquire memory sufficient to hold the object data and cache
// it to p. (Otherwise, if it is non-NULL, then memory has already been
// acquired from the memory manager and cached.) We then set the main
// buffer of c to the cached address of the cast memory.
bli_obj_set_buffer_with_cached_cast_mem( *p, *c );
}
*/
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_int.h 0000664 0000000 0000000 00000003637 14634250137 0025202 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for bli_unpackv_int(); the definition is not part of this header.
// Parameters:
//   p    - first object operand.
//   a    - second object operand.
//   cntx - context pointer.
//   cntl - unpackv control-tree node.
// NOTE(review): presumably p is the packed vector and a the object it is
// unpacked into -- confirm against the definition.
void bli_unpackv_int( obj_t* p,
obj_t* a,
cntx_t* cntx,
unpackv_t* cntl );
/*
void bli_unpackv_init_cast( obj_t* p,
obj_t* a,
obj_t* c );
*/
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_unb_var1.c 0000664 0000000 0000000 00000006145 14634250137 0026115 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (unpackv_fp) is the pointer type shared by the type-specific
// unpackv_unb_var1 implementations generated by GENTFUNC later in this file:
// a length m, two void* buffers with their increments, and a context.
#define FUNCPTR_T unpackv_fp
typedef void (*FUNCPTR_T)(
dim_t m,
void* p, inc_t incp,
void* c, inc_t incc,
cntx_t* cntx
);
// File-local table of those implementations, declared through GENARRAY. It is
// indexed by a num_t datatype value in bli_unpackv_unb_var1().
static FUNCPTR_T GENARRAY(ftypes,unpackv_unb_var1);
// Object-based entry point for unpackv (unblocked variant 1).
//
// Reads the datatype, buffer and increment of p, the vector dimension, buffer
// and increment of c, and then calls the type-specific implementation stored
// in ftypes[] for the datatype of p. The cntl argument is not referenced.
void bli_unpackv_unb_var1( obj_t* p,
                           obj_t* c,
                           cntx_t* cntx,
                           unpackv_t* cntl )
{
    // The datatype of p selects the entry of the function-pointer table.
    const num_t dt_pc = bli_obj_dt( p );

    // The number of elements is taken from c.
    const dim_t dim_c = bli_obj_vector_dim( c );

    // Buffer address (at the object's offset) and increment of p.
    void* const buf_p = bli_obj_buffer_at_off( p );
    const inc_t incp  = bli_obj_vector_inc( p );

    // Buffer address (at the object's offset) and increment of c.
    void* const buf_c = bli_obj_buffer_at_off( c );
    const inc_t incc  = bli_obj_vector_inc( c );

    // Dispatch directly through the table.
    ftypes[ dt_pc ]( dim_c, buf_p, incp, buf_c, incc, cntx );
}
// Template for the type-specific unpackv_unb_var1 functions. Each instance
// queries the context for the copyv level-1v kernel of its datatype and calls
// that kernel with BLIS_NO_CONJUGATE to copy m elements from p (increment
// incp) to c (increment incc).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   m, \
       void*   p, inc_t incp, \
       void*   c, inc_t incc, \
       cntx_t* cntx \
     ) \
{ \
    /* Datatype constant associated with this instance's type character. */ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Look up the copyv kernel for this datatype in the context. */ \
    PASTECH(ch,copyv_ft) copy_ker = \
        bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
\
    /* Copy p into c without conjugation. */ \
    copy_ker( BLIS_NO_CONJUGATE, m, p, incp, c, incc, cntx ); \
}
INSERT_GENTFUNC_BASIC0( unpackv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/1/other/unpackv/bli_unpackv_unb_var1.h 0000664 0000000 0000000 00000004066 14634250137 0026122 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based unpackv, unblocked variant 1.
// Parameters:
//   p    - first object operand.
//   c    - second object operand.
//   cntx - context pointer.
//   cntl - unpackv control-tree node.
// NOTE(review): presumably p is the packed source and c the destination --
// confirm against the definition in bli_unpackv_unb_var1.c.
void bli_unpackv_unb_var1( obj_t* p,
obj_t* c,
cntx_t* cntx,
unpackv_t* cntl );
// Prototype template for the type-specific unpackv_unb_var1 functions. Each
// instance takes a length m, two void* buffers with their increments, and a
// context. INSERT_GENTPROT_BASIC instantiates it for unpackv_unb_var1
// (presumably once per supported datatype; that macro is defined elsewhere).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
dim_t m, \
void* p, inc_t incp, \
void* c, inc_t incc, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( unpackv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/1d/ 0000775 0000000 0000000 00000000000 14634250137 0017225 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d.h 0000664 0000000 0000000 00000004330 14634250137 0020704 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l1d_check.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
#include "bli_l1d_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_oapi_ba.h"
#include "bli_l1d_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
#include "bli_tapi_ex.h"
#include "bli_l1d_tapi.h"
#include "bli_l1d_ft.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l1d_tapi.h"
#include "bli_l1d_ft.h"
#include "bli_xapi_undef.h"
// Generate function pointer arrays for tapi functions (expert only).
#include "bli_l1d_fpa.h"
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_check.c 0000664 0000000 0000000 00000013425 14634250137 0022041 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based check functions.
//
// addd, copyd, subd: generate an (x, y) check function for each operation.
// The function name is formed by PASTEMAC(opname,_check) and the body simply
// forwards to bli_l1d_xy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* y \
) \
{ \
bli_l1d_xy_check( x, y ); \
}
GENFRONT( addd )
GENFRONT( copyd )
GENFRONT( subd )
// axpyd, scal2d: generate an (alpha, x, y) check function for each operation.
// The body forwards all three operands to bli_l1d_axy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
) \
{ \
bli_l1d_axy_check( alpha, x, y ); \
}
GENFRONT( axpyd )
GENFRONT( scal2d )
// invertd: generate a single-operand check function that forwards x to
// bli_l1d_x_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x \
) \
{ \
bli_l1d_x_check( x ); \
}
GENFRONT( invertd )
// scald, setd, setid, shiftd: generate an (alpha, x) check function for each
// operation. The body forwards both operands to bli_l1d_ax_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x \
) \
{ \
bli_l1d_ax_check( alpha, x ); \
}
GENFRONT( scald )
GENFRONT( setd )
GENFRONT( setid )
GENFRONT( shiftd )
// xpbyd: generate an (x, beta, y) check function. The scalar beta is passed
// in the alpha position of bli_l1d_axy_check(), so it receives the same
// scalar checks that alpha does for axpyd/scal2d.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
) \
{ \
bli_l1d_axy_check( beta, x, y ); \
}
GENFRONT( xpbyd )
// -----------------------------------------------------------------------------
// Run the operand checks shared by the level-1d operations that take x and y.
// The result of every individual check is handed to bli_check_error_code()
// before the next check runs.
void bli_l1d_xy_check( obj_t* x, obj_t* y )
{
    // Datatypes: both operands are checked as floating objects.
    err_t status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // Datatypes of x and y must be consistent with each other.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimensions: both operands are checked as matrix objects, and their
    // dimensions must be conformal.
    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    status = bli_check_matrix_object( y );
    bli_check_error_code( status );

    status = bli_check_conformal_dims( x, y );
    bli_check_error_code( status );

    // Buffers: neither operand may have a NULL buffer.
    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Run the operand checks shared by the level-1d operations that take a scalar
// alpha together with x and y. The result of every individual check is handed
// to bli_check_error_code() before the next check runs.
void bli_l1d_axy_check( obj_t* alpha, obj_t* x, obj_t* y )
{
    // Datatypes: alpha is checked as a non-integer object, x and y as
    // floating objects.
    err_t status = bli_check_noninteger_object( alpha );
    bli_check_error_code( status );

    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // Datatypes of x and y must be consistent with each other.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimensions: alpha is checked as a scalar object, x and y as matrix
    // objects with conformal dimensions.
    status = bli_check_scalar_object( alpha );
    bli_check_error_code( status );

    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    status = bli_check_matrix_object( y );
    bli_check_error_code( status );

    status = bli_check_conformal_dims( x, y );
    bli_check_error_code( status );

    // Buffers: none of the operands may have a NULL buffer.
    status = bli_check_object_buffer( alpha );
    bli_check_error_code( status );

    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Run the operand checks for level-1d operations that take only x. The result
// of every individual check is handed to bli_check_error_code() before the
// next check runs.
void bli_l1d_x_check( obj_t* x )
{
    // Datatype: x is checked as a floating object.
    err_t status = bli_check_floating_object( x );
    bli_check_error_code( status );

    // Dimensions: x is checked as a matrix object.
    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    // Buffer: x may not have a NULL buffer.
    status = bli_check_object_buffer( x );
    bli_check_error_code( status );
}
// Run the operand checks shared by the level-1d operations that take a scalar
// alpha and x. The result of every individual check is handed to
// bli_check_error_code() before the next check runs.
void bli_l1d_ax_check( obj_t* alpha, obj_t* x )
{
    // Datatypes: alpha is checked as a non-integer object, x as a floating
    // object.
    err_t status = bli_check_noninteger_object( alpha );
    bli_check_error_code( status );

    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    // Dimensions: alpha is checked as a scalar object, x as a matrix object.
    status = bli_check_scalar_object( alpha );
    bli_check_error_code( status );

    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    // Buffers: neither operand may have a NULL buffer.
    status = bli_check_object_buffer( alpha );
    bli_check_error_code( status );

    status = bli_check_object_buffer( x );
    bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_check.h 0000664 0000000 0000000 00000005726 14634250137 0022053 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// addd, copyd, subd: prototypes of the (x, y) check functions. Despite the
// GENTPROT name, this template takes only the operation name.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* y \
);
GENTPROT( addd )
GENTPROT( copyd )
GENTPROT( subd )
// axpyd, scal2d: prototypes of the (alpha, x, y) check functions.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
);
GENTPROT( axpyd )
GENTPROT( scal2d )
// invertd: prototype of the single-operand (x) check function.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x \
);
GENTPROT( invertd )
// scald, setd, setid, shiftd: prototypes of the (alpha, x) check functions.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x \
);
GENTPROT( scald )
GENTPROT( setd )
GENTPROT( setid )
GENTPROT( shiftd )
// xpbyd: prototype of the (x, beta, y) check function. Note the operand
// order: the scalar beta sits between x and y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
);
GENTPROT( xpbyd )
// -----------------------------------------------------------------------------
// Shared checks for level-1d operations on two operands x and y. This is the
// helper called by the addd, copyd and subd check functions.
void bli_l1d_xy_check
(
obj_t* x,
obj_t* y
);
// Shared checks for level-1d operations on a scalar alpha and two operands x
// and y. This is the helper called by the axpyd, scal2d and xpbyd check
// functions; xpbyd passes its beta as alpha.
void bli_l1d_axy_check
(
obj_t* alpha,
obj_t* x,
obj_t* y
);
// Shared checks for level-1d operations on a single operand x. This is the
// helper called by the invertd check function.
void bli_l1d_x_check
(
obj_t* x
);
// Shared checks for level-1d operations on a scalar alpha and one operand x.
// This is the helper called by the scald, setd, setid and shiftd check
// functions.
void bli_l1d_ax_check
(
obj_t* alpha,
obj_t* x
);
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_fpa.c 0000664 0000000 0000000 00000004324 14634250137 0021530 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each level-1d operation, GENFRONT emits two things:
//   1. a function-pointer array, declared through GENARRAY_FPA, whose element
//      type is PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft);
//   2. a query function, named by PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp),
//      that returns the array element at index dt.
// The query function does not range-check dt; it is used directly as the
// array index.
// NOTE(review): the returned array is referred to with the _fpa suffix, which
// is presumably the name GENARRAY_FPA gives it -- confirm against that macro.
#undef GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
{ \
return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
}
GENFRONT( addd )
GENFRONT( copyd )
GENFRONT( subd )
GENFRONT( axpyd )
GENFRONT( scal2d )
GENFRONT( invertd )
GENFRONT( scald )
GENFRONT( setd )
GENFRONT( setid )
GENFRONT( shiftd )
GENFRONT( xpbyd )
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_fpa.h 0000664 0000000 0000000 00000004011 14634250137 0021526 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// Prototype template for the per-operation function-pointer query functions.
// Each one takes a num_t datatype and returns a value of the operation's
// PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) function-pointer type.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_ft.h 0000664 0000000 0000000 00000010605 14634250137 0021377 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-1d function types --------------------------------------------------
//
// addd, copyd, subd
// Function-pointer type for typed-API functions that take x and y. The
// arguments are the diagonal offset, diag and trans properties of x, the
// dimensions m and n, and for each of x and y a buffer with row and column
// strides.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led list of extra parameters; it is
// defined outside this header.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )
// axpyd, scal2d
// Function-pointer type for typed-API functions that take a scalar alpha
// together with x and y. alpha has the same element type (ctype) as x and y.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )
// invertd
// Function-pointer type for the typed-API function that takes only x: the
// diagonal offset, the dimensions m and n, and the buffer of x with its row
// and column strides.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( invertd )
// scald, setd
// Function-pointer type for typed-API functions that take a scalar alpha and
// x. These are the only level-1d types in this header with a leading
// conj_t conjalpha argument.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conjalpha, \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )
// setid
// Function-pointer type for the typed-API setid function. It uses the
// GENTDEFR template because alpha is a ctype_r* while x is a ctype*.
// NOTE(review): ctype_r is presumably the real projection of ctype -- confirm
// against INSERT_GENTDEFR.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype_r* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEFR( setid )
// shiftd
// Function-pointer type for the typed-API shiftd function: the diagonal
// offset, the dimensions m and n, a scalar alpha of the same element type as
// x, and the buffer of x with its row and column strides.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( shiftd )
// xpbyd
// Function-pointer type for the typed-API xpbyd function. It has the same
// arguments as the addd/copyd/subd type, plus a scalar beta placed between
// the x and y operands.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* beta, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( xpbyd )
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_oapi.c 0000664 0000000 0000000 00000026343 14634250137 0021717 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
//
// Define object-based interfaces.
//
// addd, copyd, subd: object-API front ends taking (x, y).
// Each generated function:
//   1. calls bli_init_once();
//   2. reads the datatype, diagonal offset, diag and conjtrans properties,
//      buffer and strides from x, and the dimensions, buffer and strides
//      from y;
//   3. runs the operation's _check function if error checking is enabled;
//   4. queries the typed-API function pointer for the datatype of x and
//      calls it with the extracted fields plus cntx and rntm.
// NOTE(review): cntx and rntm are not declared in this macro; they presumably
// come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm there.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
num_t dt = bli_obj_dt( x ); \
\
doff_t diagoffx = bli_obj_diag_offset( x ); \
diag_t diagx = bli_obj_diag( x ); \
trans_t transx = bli_obj_conjtrans_status( x ); \
/* m and n are read from y, not from x. */ \
dim_t m = bli_obj_length( y ); \
dim_t n = bli_obj_width( y ); \
void* buf_x = bli_obj_buffer_at_off( x ); \
inc_t rs_x = bli_obj_row_stride( x ); \
inc_t cs_x = bli_obj_col_stride( x ); \
void* buf_y = bli_obj_buffer_at_off( y ); \
inc_t rs_y = bli_obj_row_stride( y ); \
inc_t cs_y = bli_obj_col_stride( y ); \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( x, y ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
f \
( \
diagoffx, \
diagx, \
transx, \
m, \
n, \
buf_x, rs_x, cs_x, \
buf_y, rs_y, cs_y, \
cntx, \
rntm \
); \
}
GENFRONT( addd )
GENFRONT( copyd )
GENFRONT( subd )
// axpyd, scal2d: object-API front ends taking (alpha, x, y).
// Each generated function:
//   1. calls bli_init_once();
//   2. reads the datatype, diagonal offset, diag and conjtrans properties,
//      buffer and strides from x, and the dimensions, buffer and strides
//      from y;
//   3. runs the operation's _check function if error checking is enabled;
//   4. makes a local copy of alpha cast to the datatype of x and takes the
//      address of that copy's 1x1 buffer;
//   5. queries the typed-API function pointer for the datatype of x and
//      calls it with the extracted fields plus cntx and rntm.
// NOTE(review): cntx and rntm are not declared in this macro; they presumably
// come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm there.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
num_t dt = bli_obj_dt( x ); \
\
doff_t diagoffx = bli_obj_diag_offset( x ); \
diag_t diagx = bli_obj_diag( x ); \
trans_t transx = bli_obj_conjtrans_status( x ); \
/* m and n are read from y, not from x. */ \
dim_t m = bli_obj_length( y ); \
dim_t n = bli_obj_width( y ); \
void* buf_x = bli_obj_buffer_at_off( x ); \
inc_t rs_x = bli_obj_row_stride( x ); \
inc_t cs_x = bli_obj_col_stride( x ); \
void* buf_y = bli_obj_buffer_at_off( y ); \
inc_t rs_y = bli_obj_row_stride( y ); \
inc_t cs_y = bli_obj_col_stride( y ); \
\
void* buf_alpha; \
\
obj_t alpha_local; \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( alpha, x, y ); \
\
/* Create local copy-casts of scalars (and apply internal conjugation
as needed). */ \
bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
alpha, &alpha_local ); \
buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
f \
( \
diagoffx, \
diagx, \
transx, \
m, \
n, \
buf_alpha, \
buf_x, rs_x, cs_x, \
buf_y, rs_y, cs_y, \
cntx, \
rntm \
); \
}
GENFRONT( axpyd )
GENFRONT( scal2d )
// invertd: object-API front end taking only x.
// The generated function:
//   1. calls bli_init_once();
//   2. reads the datatype, diagonal offset, dimensions, buffer and strides
//      from x;
//   3. runs the operation's _check function if error checking is enabled;
//   4. queries the typed-API function pointer for the datatype of x and
//      calls it with the extracted fields plus cntx and rntm.
// NOTE(review): cntx and rntm are not declared in this macro; they presumably
// come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm there.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
num_t dt = bli_obj_dt( x ); \
\
doff_t diagoffx = bli_obj_diag_offset( x ); \
dim_t m = bli_obj_length( x ); \
dim_t n = bli_obj_width( x ); \
void* buf_x = bli_obj_buffer_at_off( x ); \
inc_t rs_x = bli_obj_row_stride( x ); \
inc_t cs_x = bli_obj_col_stride( x ); \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( x ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
f \
( \
diagoffx, \
m, \
n, \
buf_x, rs_x, cs_x, \
cntx, \
rntm \
); \
}
GENFRONT( invertd )
// scald, setd: object-API front ends taking (alpha, x).
// Each generated function:
//   1. calls bli_init_once();
//   2. reads the datatype, diagonal offset, dimensions, buffer and strides
//      from x;
//   3. runs the operation's _check function if error checking is enabled;
//   4. makes a local copy of alpha cast to the datatype of x and takes the
//      address of that copy's 1x1 buffer;
//   5. queries the typed-API function pointer for the datatype of x and
//      calls it with BLIS_NO_CONJUGATE as conjalpha, followed by the
//      extracted fields plus cntx and rntm.
// The conjugation status of alpha is never read here (that line is commented
// out); the typed function always receives BLIS_NO_CONJUGATE.
// NOTE(review): cntx and rntm are not declared in this macro; they presumably
// come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm there.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
num_t dt = bli_obj_dt( x ); \
\
/* conj_t conjalpha = bli_obj_conj_status( alpha ); */ \
doff_t diagoffx = bli_obj_diag_offset( x ); \
dim_t m = bli_obj_length( x ); \
dim_t n = bli_obj_width( x ); \
void* buf_x = bli_obj_buffer_at_off( x ); \
inc_t rs_x = bli_obj_row_stride( x ); \
inc_t cs_x = bli_obj_col_stride( x ); \
\
void* buf_alpha; \
\
obj_t alpha_local; \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( alpha, x ); \
\
/* Create local copy-casts of scalars (and apply internal conjugation
as needed). */ \
bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
alpha, &alpha_local ); \
buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
f \
( \
BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \
diagoffx, \
m, \
n, \
buf_alpha, \
buf_x, rs_x, cs_x, \
cntx, \
rntm \
); \
}
GENFRONT( scald )
GENFRONT( setd )
// setid: object-API front end taking (alpha, x).
// The generated function:
//   1. calls bli_init_once();
//   2. reads the datatype, diagonal offset, dimensions, buffer and strides
//      from x, and the 1x1 buffer address of alpha for that datatype;
//   3. runs the operation's _check function if error checking is enabled;
//   4. queries the typed-API function pointer for the datatype of x and
//      calls it with the extracted fields plus cntx and rntm.
// NOTE(review): unlike the scald/setd and shiftd front ends in this file,
// alpha is not copy-cast into a local object here. Its buffer is taken
// directly from the caller's object, and before the error check runs. The
// setid function type in bli_l1d_ft.h declares alpha as ctype_r*, so please
// confirm that querying the buffer with the datatype of x is intended.
// NOTE(review): cntx and rntm are not declared in this macro; they presumably
// come from BLIS_OAPI_EX_PARAMS / BLIS_OAPI_EX_DECLS -- confirm there.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
) \
{ \
bli_init_once(); \
\
BLIS_OAPI_EX_DECLS \
\
num_t dt = bli_obj_dt( x ); \
\
doff_t diagoffx = bli_obj_diag_offset( x ); \
dim_t m = bli_obj_length( x ); \
dim_t n = bli_obj_width( x ); \
void* buf_x = bli_obj_buffer_at_off( x ); \
inc_t rs_x = bli_obj_row_stride( x ); \
inc_t cs_x = bli_obj_col_stride( x ); \
\
void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
if ( bli_error_checking_is_enabled() ) \
PASTEMAC(opname,_check)( alpha, x ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
f \
( \
diagoffx, \
m, \
n, \
buf_alpha, \
buf_x, rs_x, cs_x, \
cntx, \
rntm \
); \
}
GENFRONT( setid )
#undef GENFRONT
// Object API front-end template for shiftd (instantiated below). The
// generated function unpacks x, makes a local copy of alpha cast to the
// datatype of x, and forwards everything to the typed API function that the
// operation's _qfp() query returns for that datatype.
//
// cntx and rntm are not declared in this template; they are expected to be
// introduced by BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* Unpack x: datatype, buffer address, strides, dimensions, and */ \
    /* diagonal offset. */ \
    num_t  dt_x     = bli_obj_dt( x ); \
    void*  x_buf    = bli_obj_buffer_at_off( x ); \
    inc_t  rs_x     = bli_obj_row_stride( x ); \
    inc_t  cs_x     = bli_obj_col_stride( x ); \
    dim_t  m        = bli_obj_length( x ); \
    dim_t  n        = bli_obj_width( x ); \
    doff_t diagoffx = bli_obj_diag_offset( x ); \
\
    /* Receives the copy of alpha after it is cast to the datatype of x. */ \
    obj_t  alpha_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x ); \
    } \
\
    /* Copy-cast alpha into alpha_cast (internal conjugation is applied */ \
    /* as needed during the copy) and fetch the address of the result. */ \
    bli_obj_scalar_init_detached_copy_of( dt_x, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt_x, &alpha_cast ); \
\
    /* Look up the type-specific function for dt_x. The _vft pointer type */ \
    /* uses void* for its operands instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt_x ); \
\
    tapi_fp \
    ( \
      diagoffx, \
      m, \
      n, \
      alpha_buf, \
      x_buf, rs_x, cs_x, \
      cntx, \
      rntm \
    ); \
}
GENFRONT( shiftd )
#undef GENFRONT
// Object API front-end template for xpbyd (instantiated below). The
// generated function unpacks x and y, makes a local copy of beta cast to the
// datatype of x, and forwards everything to the typed API function that the
// operation's _qfp() query returns for that datatype. Note that the
// dimensions m and n are read from y, while the diagonal offset, diag, and
// conjugation/transposition properties are read from x.
//
// cntx and rntm are not declared in this template; they are expected to be
// introduced by BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* Properties taken from x. */ \
    num_t   dt_x     = bli_obj_dt( x ); \
    doff_t  diagoffx = bli_obj_diag_offset( x ); \
    diag_t  diagx    = bli_obj_diag( x ); \
    trans_t transx   = bli_obj_conjtrans_status( x ); \
    void*   x_buf    = bli_obj_buffer_at_off( x ); \
    inc_t   rs_x     = bli_obj_row_stride( x ); \
    inc_t   cs_x     = bli_obj_col_stride( x ); \
\
    /* Properties taken from y, including the problem dimensions. */ \
    dim_t   m        = bli_obj_length( y ); \
    dim_t   n        = bli_obj_width( y ); \
    void*   y_buf    = bli_obj_buffer_at_off( y ); \
    inc_t   rs_y     = bli_obj_row_stride( y ); \
    inc_t   cs_y     = bli_obj_col_stride( y ); \
\
    /* Receives the copy of beta after it is cast to the datatype of x. */ \
    obj_t   beta_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, beta, y ); \
    } \
\
    /* Copy-cast beta into beta_cast (internal conjugation is applied as */ \
    /* needed during the copy) and fetch the address of the result. */ \
    bli_obj_scalar_init_detached_copy_of( dt_x, BLIS_NO_CONJUGATE, \
                                          beta, &beta_cast ); \
    void* beta_buf = bli_obj_buffer_for_1x1( dt_x, &beta_cast ); \
\
    /* Look up the type-specific function for dt_x. The _vft pointer type */ \
    /* uses void* for its operands instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt_x ); \
\
    tapi_fp \
    ( \
      diagoffx, \
      diagx, \
      transx, \
      m, \
      n, \
      x_buf, rs_x, cs_x, \
      beta_buf, \
      y_buf, rs_y, cs_y, \
      cntx, \
      rntm \
    ); \
}
GENFRONT( xpbyd )
#endif
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_oapi.h 0000664 0000000 0000000 00000005502 14634250137 0021716 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
#undef GENTPROT
// Prototype template for object API functions that take two operands, x and
// y. EX_SUF supplies the function-name suffix and BLIS_OAPI_EX_PARAMS appends
// any additional trailing parameters; both are expected to be defined by the
// includer before this header is processed.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
// Operations sharing the (x, y) signature.
GENTPROT( addd )
GENTPROT( copyd )
GENTPROT( subd )
#undef GENTPROT
// Prototype template for object API functions that take a scalar alpha and
// two operands, x and y. EX_SUF and BLIS_OAPI_EX_PARAMS are expected to be
// defined by the includer before this header is processed.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
// Operations sharing the (alpha, x, y) signature.
GENTPROT( axpyd )
GENTPROT( scal2d )
#undef GENTPROT
// Prototype template for object API functions that take a single operand x.
// EX_SUF and BLIS_OAPI_EX_PARAMS are expected to be defined by the includer
// before this header is processed.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
);
// Operation using the (x) signature.
GENTPROT( invertd )
#undef GENTPROT
// Prototype template for object API functions that take a scalar alpha and a
// single operand x. EX_SUF and BLIS_OAPI_EX_PARAMS are expected to be defined
// by the includer before this header is processed.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
);
// Operations sharing the (alpha, x) signature.
GENTPROT( scald )
GENTPROT( setd )
GENTPROT( setid )
GENTPROT( shiftd )
#undef GENTPROT
// Prototype template for object API functions that take an operand x, a
// scalar beta, and an operand y. EX_SUF and BLIS_OAPI_EX_PARAMS are expected
// to be defined by the includer before this header is processed.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
// Operation using the (x, beta, y) signature.
GENTPROT( xpbyd )
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_oapi_ba.c 0000664 0000000 0000000 00000003670 14634250137 0022357 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This translation unit builds the variants of the bli_l1d object API that
// omit the expert parameters. It contains no code of its own: it configures
// the template macros and then textually includes the shared definitions.
// The order of the three steps below matters.
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_oapi_ba.h"
// Define the macro protecting the object API definitions, so that the
// definitions in the file included next are actually compiled.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1d_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_oapi_ex.c 0000664 0000000 0000000 00000003666 14634250137 0022416 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This translation unit builds the variants of the bli_l1d object API that
// have the expert parameters. It contains no code of its own: it configures
// the template macros and then textually includes the shared definitions.
// The order of the three steps below matters.
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
// Define the macro protecting the object API definitions, so that the
// definitions in the file included next are actually compiled.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1d_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_tapi.c 0000664 0000000 0000000 00000032105 14634250137 0021715 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
//
// Define BLAS-like interfaces with typed operands.
//
#undef GENTFUNC
// Typed API template for two-operand diagonal operations (instantiated below
// for addd, copyd, and subd). The generated function locates the diagonal of
// x selected by diagoffx/transx and the matching diagonal of y, then invokes
// the level-1v kernel identified by kerid on the two diagonals, treating
// them as strided vectors. When diagx does not indicate a non-unit diagonal,
// x is not read: a constant one with a zero increment stands in for it.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \
\
    /* Determine the distance to the diagonals, the number of diagonal */ \
    /* elements, and the diagonal increments. */ \
    dim_t x_off, y_off; \
    dim_t diag_len; \
    inc_t x_inc, y_inc; \
\
    bli_set_dims_incs_2d \
    ( \
      diagoffx, transx, \
      m, n, rs_x, cs_x, rs_y, cs_y, \
      &x_off, &y_off, &diag_len, &x_inc, &y_inc \
    ); \
\
    /* y is addressed at its diagonal in every case. */ \
    ctype* y_diag = y + y_off; \
    ctype* x_diag; \
\
    if ( bli_is_nonunit_diag( diagx ) ) \
    { \
        x_diag = x + x_off; \
    } \
    else \
    { \
        /* Simulate a unit diagonal for x with a zero increment over a */ \
        /* unit scalar. */ \
        x_diag = PASTEMAC(ch,1); \
        x_inc  = 0; \
    } \
\
    conj_t conjx = bli_extract_conj( transx ); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* Run the kernel over the two diagonals. */ \
    ker_fp \
    ( \
      conjx, \
      diag_len, \
      x_diag, x_inc, \
      y_diag, y_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER )
INSERT_GENTFUNC_BASIC2( copyd, copyv, BLIS_COPYV_KER )
INSERT_GENTFUNC_BASIC2( subd, subv, BLIS_SUBV_KER )
#undef GENTFUNC
// Typed API template for two-operand diagonal operations that also take a
// scalar alpha (instantiated below for axpyd and scal2d). The generated
// function locates the diagonal of x selected by diagoffx/transx and the
// matching diagonal of y, then invokes the level-1v kernel identified by
// kerid with alpha and the two diagonals, treating them as strided vectors.
// When diagx does not indicate a non-unit diagonal, x is not read: a
// constant one with a zero increment stands in for it.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \
\
    /* Determine the distance to the diagonals, the number of diagonal */ \
    /* elements, and the diagonal increments. */ \
    dim_t x_off, y_off; \
    dim_t diag_len; \
    inc_t x_inc, y_inc; \
\
    bli_set_dims_incs_2d \
    ( \
      diagoffx, transx, \
      m, n, rs_x, cs_x, rs_y, cs_y, \
      &x_off, &y_off, &diag_len, &x_inc, &y_inc \
    ); \
\
    /* y is addressed at its diagonal in every case. */ \
    ctype* y_diag = y + y_off; \
    ctype* x_diag; \
\
    if ( bli_is_nonunit_diag( diagx ) ) \
    { \
        x_diag = x + x_off; \
    } \
    else \
    { \
        /* Simulate a unit diagonal for x with a zero increment over a */ \
        /* unit scalar. */ \
        x_diag = PASTEMAC(ch,1); \
        x_inc  = 0; \
    } \
\
    conj_t conjx = bli_extract_conj( transx ); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* Run the kernel over the two diagonals, scaling by alpha. */ \
    ker_fp \
    ( \
      conjx, \
      diag_len, \
      alpha, \
      x_diag, x_inc, \
      y_diag, y_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER )
INSERT_GENTFUNC_BASIC2( scal2d, scal2v, BLIS_SCAL2V_KER )
#undef GENTFUNC
// Typed API template for single-operand diagonal operations without a
// scalar (instantiated below for invertd). The generated function locates
// the diagonal of x selected by diagoffx and invokes the level-1v kernel
// identified by kerid on it, treating the diagonal as a strided vector.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \
\
    /* Determine the distance to the diagonal, the number of diagonal */ \
    /* elements, and the diagonal increment. */ \
    dim_t x_off; \
    dim_t diag_len; \
    inc_t x_inc; \
\
    bli_set_dims_incs_1d \
    ( \
      diagoffx, \
      m, n, rs_x, cs_x, \
      &x_off, &diag_len, &x_inc \
    ); \
\
    ctype* x_diag = x + x_off; \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* Run the kernel over the diagonal. */ \
    ker_fp \
    ( \
      diag_len, \
      x_diag, x_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER )
#undef GENTFUNC
// Typed API template for single-operand diagonal operations that take a
// scalar alpha and its conjugation flag (instantiated below for scald and
// setd). The generated function locates the diagonal of x selected by
// diagoffx and invokes the level-1v kernel identified by kerid with
// conjalpha, alpha, and the diagonal, treating the diagonal as a strided
// vector.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t  m, \
       dim_t  n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \
\
    /* Determine the distance to the diagonal, the number of diagonal */ \
    /* elements, and the diagonal increment. */ \
    dim_t x_off; \
    dim_t diag_len; \
    inc_t x_inc; \
\
    bli_set_dims_incs_1d \
    ( \
      diagoffx, \
      m, n, rs_x, cs_x, \
      &x_off, &diag_len, &x_inc \
    ); \
\
    ctype* x_diag = x + x_off; \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* Run the kernel over the diagonal with the caller's alpha. */ \
    ker_fp \
    ( \
      conjalpha, \
      diag_len, \
      alpha, \
      x_diag, x_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER )
INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER )
#undef GENTFUNCR
// Typed API template for setid (instantiated below with the setv kernel).
// alpha is a real-typed scalar (ctype_r) while x holds ctype elements. The
// generated function returns immediately for real datatypes; otherwise it
// views the diagonal of x through real-typed pointers, starting at the
// imaginary component of the first diagonal element, and runs the real-typed
// kernel identified by kerid over those components with alpha.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       dim_t    m, \
       dim_t    n, \
       ctype_r* alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt_x    = PASTEMAC(ch,type); \
    const num_t dt_real = PASTEMAC(chr,type); \
\
    /* If the datatype is real, the entire operation is a no-op. The */ \
    /* other two tests are the usual early exits. */ \
    if ( bli_is_real( dt_x ) || \
         bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \
\
    /* Determine the distance to the diagonal, the number of diagonal */ \
    /* elements, and the diagonal increment. */ \
    dim_t x_off; \
    dim_t diag_len; \
    inc_t x_inc; \
\
    bli_set_dims_incs_1d \
    ( \
      diagoffx, \
      m, n, rs_x, cs_x, \
      &x_off, &diag_len, &x_inc \
    ); \
\
    /* Alternate implementation (a substitute for the remainder of this */ \
    /* function): loop i over the diag_len diagonal elements and apply */ \
    /* the setis scalar macro for this datatype to alpha and the */ \
    /* element at offset i*x_inc from the start of the diagonal. */ \
\
    /* Acquire the address of the imaginary component of the first */ \
    /* element, and scale the increment for use in the real domain. The */ \
    /* indexing into the imaginary field only needs to work for complex */ \
    /* datatypes since real domain types returned early above. */ \
    ctype_r* x_imag   = ( ctype_r* )( x + x_off ) + 1; \
    inc_t    inc_imag = 2*x_inc; \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the real-typed kernel address. */ \
    PASTECH2(chr,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( dt_real, kerid, cntx ); \
\
    /* Run the kernel over the imaginary components of the diagonal. */ \
    ker_fp \
    ( \
      BLIS_NO_CONJUGATE, \
      diag_len, \
      alpha, \
      x_imag, inc_imag, \
      cntx \
    ); \
}
INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER )
#undef GENTFUNC
// Typed API template for shiftd (instantiated below with the addv kernel).
// The generated function locates the diagonal of x selected by diagoffx and
// invokes the kernel with alpha as the source operand, using a zero
// increment so that the same scalar is read for every diagonal element of x.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t  m, \
       dim_t  n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \
\
    /* Determine the distance to the diagonal, the number of diagonal */ \
    /* elements, and the diagonal increment. */ \
    dim_t x_off; \
    dim_t diag_len; \
    inc_t x_inc; \
\
    bli_set_dims_incs_1d \
    ( \
      diagoffx, \
      m, n, rs_x, cs_x, \
      &x_off, &diag_len, &x_inc \
    ); \
\
    ctype* x_diag = x + x_off; \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* alpha plays the role of the kernel's first vector operand; the */ \
    /* zero increment keeps the kernel reading that one scalar. */ \
    ker_fp \
    ( \
      BLIS_NO_CONJUGATE, \
      diag_len, \
      alpha,  0, \
      x_diag, x_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER )
#undef GENTFUNC
// Typed API template for xpbyd (instantiated below with the xpbyv kernel).
// The generated function locates the diagonal of x selected by
// diagoffx/transx and the matching diagonal of y, then invokes the kernel
// identified by kerid with the x diagonal, the scalar beta, and the y
// diagonal, treating the diagonals as strided vectors. When diagx does not
// indicate a non-unit diagonal, x is not read: a constant one with a zero
// increment stands in for it.
//
// cntx is not declared in this template; it is expected to be introduced by
// BLIS_TAPI_EX_PARAMS or BLIS_TAPI_EX_DECLS.
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Leave right away when either early-exit test fires. */ \
    if ( bli_zero_dim2( m, n ) || \
         bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \
\
    /* Determine the distance to the diagonals, the number of diagonal */ \
    /* elements, and the diagonal increments. */ \
    dim_t x_off, y_off; \
    dim_t diag_len; \
    inc_t x_inc, y_inc; \
\
    bli_set_dims_incs_2d \
    ( \
      diagoffx, transx, \
      m, n, rs_x, cs_x, rs_y, cs_y, \
      &x_off, &y_off, &diag_len, &x_inc, &y_inc \
    ); \
\
    /* y is addressed at its diagonal in every case. */ \
    ctype* y_diag = y + y_off; \
    ctype* x_diag; \
\
    if ( bli_is_nonunit_diag( diagx ) ) \
    { \
        x_diag = x + x_off; \
    } \
    else \
    { \
        /* Simulate a unit diagonal for x with a zero increment over a */ \
        /* unit scalar. */ \
        x_diag = PASTEMAC(ch,1); \
        x_inc  = 0; \
    } \
\
    conj_t conjx = bli_extract_conj( transx ); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* Query the context for the operation's kernel address. */ \
    PASTECH2(ch,kername,_ker_ft) ker_fp = \
        bli_cntx_get_l1v_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    /* Run the kernel over the two diagonals with the caller's beta. */ \
    ker_fp \
    ( \
      conjx, \
      diag_len, \
      x_diag, x_inc, \
      beta, \
      y_diag, y_inc, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER )
#endif
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_tapi.h 0000664 0000000 0000000 00000010530 14634250137 0021720 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonals
// of two matrices, x and y. Each matrix is described by a base pointer plus
// row and column strides; diagoffx, diagx, and transx describe x, and m and
// n give the dimensions. EX_SUF supplies the function-name suffix and
// BLIS_TAPI_EX_PARAMS appends any additional trailing parameters; both are
// expected to be defined by the includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
// Operations sharing the (x, y) signature.
INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonals
// of two matrices, x and y, and additionally take a pointer to a scalar
// alpha of the same type. EX_SUF and BLIS_TAPI_EX_PARAMS are expected to be
// defined by the includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
// Operations sharing the (alpha, x, y) signature.
INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonal of
// a single matrix x, described by a base pointer plus row and column
// strides. EX_SUF and BLIS_TAPI_EX_PARAMS are expected to be defined by the
// includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
// Operation using the (x) signature.
INSERT_GENTPROT_BASIC0( invertd )
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonal of
// a single matrix x and take a pointer to a scalar alpha together with a
// conjugation flag for it (conjalpha). EX_SUF and BLIS_TAPI_EX_PARAMS are
// expected to be defined by the includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjalpha, \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
// Operations sharing the (conjalpha, alpha, x) signature.
INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )
#undef GENTPROTR
// Prototype template for typed API functions whose scalar alpha uses a
// second type, ctype_r, while the matrix x uses ctype. Unlike the other
// templates in this header it is expanded via INSERT_GENTPROTR_BASIC0, which
// supplies both types (and the ch/chr characters). EX_SUF and
// BLIS_TAPI_EX_PARAMS are expected to be defined by the includer before this
// header is processed.
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype_r* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
// Operation using the (ctype_r alpha, ctype x) signature.
INSERT_GENTPROTR_BASIC0( setid )
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonal of
// a single matrix x and take a pointer to a scalar alpha of the same type
// (with no conjugation flag). EX_SUF and BLIS_TAPI_EX_PARAMS are expected to
// be defined by the includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
// Operation using the (alpha, x) signature.
INSERT_GENTPROT_BASIC0( shiftd )
#undef GENTPROT
// Prototype template for typed API functions that operate on the diagonals
// of two matrices, x and y, and take a pointer to a scalar beta that is
// passed between the two operands. EX_SUF and BLIS_TAPI_EX_PARAMS are
// expected to be defined by the includer before this header is processed.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* beta, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
// Operation using the (x, beta, y) signature.
INSERT_GENTPROT_BASIC0( xpbyd )
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_tapi_ba.c 0000664 0000000 0000000 00000003666 14634250137 0022371 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This translation unit builds the variants of the bli_l1d typed API that
// omit the expert parameters. It contains no code of its own: it configures
// the template macros and then textually includes the shared definitions.
// The order of the three steps below matters.
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions, so that the
// definitions in the file included next are actually compiled.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1d_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1d/bli_l1d_tapi_ex.c 0000664 0000000 0000000 00000003664 14634250137 0022421 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This translation unit builds the variants of the bli_l1d typed API that
// have the expert parameters. It contains no code of its own: it configures
// the template macros and then textually includes the shared definitions.
// The order of the three steps below matters.
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_tapi_ex.h"
// Define the macro protecting the typed API definitions, so that the
// definitions in the file included next are actually compiled.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1d_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1f/ 0000775 0000000 0000000 00000000000 14634250137 0017227 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f.h 0000664 0000000 0000000 00000004426 14634250137 0020716 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Umbrella header for the level-1f operations: pulls in the check prototypes,
// kernel function types, and the object/typed API prototypes.
#include "bli_l1f_check.h"
// Define kernel function types.
#include "bli_l1f_ft_ker.h"
// Prototype object APIs (expert and non-expert).
// NOTE: bli_l1f_oapi.h is intentionally included twice -- once after
// bli_oapi_ex.h and once after bli_oapi_ba.h -- with bli_xapi_undef.h after
// each pass. Presumably the _ex/_ba headers define the macros that select the
// expert vs. basic flavor and bli_xapi_undef.h clears them again; confirm
// against those headers before reordering anything here.
#include "bli_oapi_ex.h"
#include "bli_l1f_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_oapi_ba.h"
#include "bli_l1f_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
// Same two-pass pattern as above; bli_l1f_tapi.h and bli_l1f_ft.h are each
// included once per flavor, so neither may be wrapped in an include guard.
#include "bli_tapi_ex.h"
#include "bli_l1f_tapi.h"
#include "bli_l1f_ft.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l1f_tapi.h"
#include "bli_l1f_ft.h"
#include "bli_xapi_undef.h"
// Generate function pointer arrays for tapi functions (expert only).
#include "bli_l1f_fpa.h"
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_check.c 0000664 0000000 0000000 00000030374 14634250137 0022047 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based check functions.
//
// Validate the operands of the object-based axpy2v operation.
//
// Every bli_check_*() result is forwarded directly to bli_check_error_code().
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, and finally buffers.
void bli_axpy2v_check
     (
       obj_t* alphax,
       obj_t* alphay,
       obj_t* x,
       obj_t* y,
       obj_t* z
     )
{
	// Datatypes: both scalars go through the non-integer check, the three
	// vectors through the floating-point check.
	bli_check_error_code( bli_check_noninteger_object( alphax ) );
	bli_check_error_code( bli_check_noninteger_object( alphay ) );
	bli_check_error_code( bli_check_floating_object( x ) );
	bli_check_error_code( bli_check_floating_object( y ) );
	bli_check_error_code( bli_check_floating_object( z ) );

	// Datatype consistency: y and z are each compared against x.
	bli_check_error_code( bli_check_consistent_object_datatypes( x, y ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( x, z ) );

	// Dimensions: scalars must be scalars, vectors must be vectors, and
	// y and z must each match the length of x.
	bli_check_error_code( bli_check_scalar_object( alphax ) );
	bli_check_error_code( bli_check_scalar_object( alphay ) );
	bli_check_error_code( bli_check_vector_object( x ) );
	bli_check_error_code( bli_check_vector_object( y ) );
	bli_check_error_code( bli_check_vector_object( z ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, y ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, z ) );

	// Buffers: every operand goes through the buffer (non-NULLness) check.
	bli_check_error_code( bli_check_object_buffer( alphax ) );
	bli_check_error_code( bli_check_object_buffer( alphay ) );
	bli_check_error_code( bli_check_object_buffer( x ) );
	bli_check_error_code( bli_check_object_buffer( y ) );
	bli_check_error_code( bli_check_object_buffer( z ) );
}
// Validate the operands of the object-based axpyf operation.
//
// Every bli_check_*() result is forwarded directly to bli_check_error_code().
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, and finally buffers.
void bli_axpyf_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* y
     )
{
	// Datatypes: alpha goes through the non-integer check; a, x and y
	// through the floating-point check.
	bli_check_error_code( bli_check_noninteger_object( alpha ) );
	bli_check_error_code( bli_check_floating_object( a ) );
	bli_check_error_code( bli_check_floating_object( x ) );
	bli_check_error_code( bli_check_floating_object( y ) );

	// Datatype consistency: x and y are each compared against a.
	bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );

	// Dimensions: alpha is a scalar, a is a matrix, x and y are vectors;
	// x must match the (post-transposition) width of a and y its length.
	bli_check_error_code( bli_check_scalar_object( alpha ) );
	bli_check_error_code( bli_check_matrix_object( a ) );
	bli_check_error_code( bli_check_vector_object( x ) );
	bli_check_error_code( bli_check_vector_object( y ) );
	bli_check_error_code( bli_check_vector_dim_equals( x, bli_obj_width_after_trans( a ) ) );
	bli_check_error_code( bli_check_vector_dim_equals( y, bli_obj_length_after_trans( a ) ) );

	// Buffers: every operand goes through the buffer (non-NULLness) check.
	bli_check_error_code( bli_check_object_buffer( alpha ) );
	bli_check_error_code( bli_check_object_buffer( a ) );
	bli_check_error_code( bli_check_object_buffer( x ) );
	bli_check_error_code( bli_check_object_buffer( y ) );
}
// Validate the operands of the object-based dotaxpyv operation.
//
// Every bli_check_*() result is forwarded directly to bli_check_error_code().
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, aliasing, and finally buffers.
void bli_dotaxpyv_check
     (
       obj_t* alpha,
       obj_t* xt,
       obj_t* x,
       obj_t* y,
       obj_t* rho,
       obj_t* z
     )
{
	// Datatypes: alpha and rho go through the non-integer check (rho
	// additionally through the non-constant check); xt, x, y and z through
	// the floating-point check.
	bli_check_error_code( bli_check_noninteger_object( alpha ) );
	bli_check_error_code( bli_check_floating_object( xt ) );
	bli_check_error_code( bli_check_floating_object( x ) );
	bli_check_error_code( bli_check_floating_object( y ) );
	bli_check_error_code( bli_check_noninteger_object( rho ) );
	bli_check_error_code( bli_check_nonconstant_object( rho ) );
	bli_check_error_code( bli_check_floating_object( z ) );

	// Datatype consistency: xt, y and z are each compared against x.
	bli_check_error_code( bli_check_consistent_object_datatypes( x, xt ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( x, y ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( x, z ) );

	// Dimensions: alpha and rho are scalars, the rest are vectors whose
	// lengths must each match that of x.
	bli_check_error_code( bli_check_scalar_object( alpha ) );
	bli_check_error_code( bli_check_vector_object( xt ) );
	bli_check_error_code( bli_check_vector_object( x ) );
	bli_check_error_code( bli_check_vector_object( y ) );
	bli_check_error_code( bli_check_scalar_object( rho ) );
	bli_check_error_code( bli_check_vector_object( z ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, xt ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, y ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, z ) );

	// Aliasing: xt is checked as an alias of x.
	bli_check_error_code( bli_check_object_alias_of( xt, x ) );

	// Buffers: every operand goes through the buffer (non-NULLness) check.
	bli_check_error_code( bli_check_object_buffer( alpha ) );
	bli_check_error_code( bli_check_object_buffer( xt ) );
	bli_check_error_code( bli_check_object_buffer( x ) );
	bli_check_error_code( bli_check_object_buffer( y ) );
	bli_check_error_code( bli_check_object_buffer( rho ) );
	bli_check_error_code( bli_check_object_buffer( z ) );
}
// Validate the operands of the object-based dotxaxpyf operation.
//
// Every bli_check_*() result is forwarded directly to bli_check_error_code().
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, aliasing, and finally buffers.
void bli_dotxaxpyf_check
     (
       obj_t* alpha,
       obj_t* at,
       obj_t* a,
       obj_t* w,
       obj_t* x,
       obj_t* beta,
       obj_t* y,
       obj_t* z
     )
{
	// Datatypes: alpha and beta go through the non-integer check; the
	// matrices and vectors through the floating-point check.
	bli_check_error_code( bli_check_noninteger_object( alpha ) );
	bli_check_error_code( bli_check_floating_object( at ) );
	bli_check_error_code( bli_check_floating_object( a ) );
	bli_check_error_code( bli_check_floating_object( w ) );
	bli_check_error_code( bli_check_floating_object( x ) );
	bli_check_error_code( bli_check_noninteger_object( beta ) );
	bli_check_error_code( bli_check_floating_object( y ) );
	bli_check_error_code( bli_check_floating_object( z ) );

	// Datatype consistency: at, w, x, y and z are each compared against a.
	bli_check_error_code( bli_check_consistent_object_datatypes( a, at ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, w ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, z ) );

	// Dimensions, part 1: object kinds (scalar / matrix / vector).
	bli_check_error_code( bli_check_scalar_object( alpha ) );
	bli_check_error_code( bli_check_matrix_object( at ) );
	bli_check_error_code( bli_check_matrix_object( a ) );
	bli_check_error_code( bli_check_vector_object( w ) );
	bli_check_error_code( bli_check_vector_object( x ) );
	bli_check_error_code( bli_check_scalar_object( beta ) );
	bli_check_error_code( bli_check_vector_object( y ) );
	bli_check_error_code( bli_check_vector_object( z ) );

	// Dimensions, part 2: w/z and x/y must pair up in length, at and a must
	// be conformal, and each matrix dimension must match its vector.
	bli_check_error_code( bli_check_equal_vector_lengths( w, z ) );
	bli_check_error_code( bli_check_equal_vector_lengths( x, y ) );
	bli_check_error_code( bli_check_conformal_dims( at, a ) );
	bli_check_error_code( bli_check_object_length_equals( at, bli_obj_vector_dim( w ) ) );
	bli_check_error_code( bli_check_object_width_equals( at, bli_obj_vector_dim( y ) ) );
	bli_check_error_code( bli_check_object_length_equals( a, bli_obj_vector_dim( z ) ) );
	bli_check_error_code( bli_check_object_width_equals( a, bli_obj_vector_dim( x ) ) );

	// Aliasing: at is checked as an alias of a.
	bli_check_error_code( bli_check_object_alias_of( at, a ) );

	// Buffers: every operand goes through the buffer (non-NULLness) check.
	bli_check_error_code( bli_check_object_buffer( alpha ) );
	bli_check_error_code( bli_check_object_buffer( at ) );
	bli_check_error_code( bli_check_object_buffer( a ) );
	bli_check_error_code( bli_check_object_buffer( w ) );
	bli_check_error_code( bli_check_object_buffer( x ) );
	bli_check_error_code( bli_check_object_buffer( beta ) );
	bli_check_error_code( bli_check_object_buffer( y ) );
	bli_check_error_code( bli_check_object_buffer( z ) );
}
// Validate the operands of the object-based dotxf operation.
//
// Every bli_check_*() result is forwarded directly to bli_check_error_code().
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, and finally buffers.
void bli_dotxf_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* beta,
       obj_t* y
     )
{
	// Datatypes: alpha and beta go through the non-integer check; a, x and
	// y through the floating-point check.
	bli_check_error_code( bli_check_noninteger_object( alpha ) );
	bli_check_error_code( bli_check_floating_object( a ) );
	bli_check_error_code( bli_check_floating_object( x ) );
	bli_check_error_code( bli_check_noninteger_object( beta ) );
	bli_check_error_code( bli_check_floating_object( y ) );

	// Datatype consistency: x and y are each compared against a.
	bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
	bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );

	// Dimensions: alpha/beta are scalars, a is a matrix, x and y are
	// vectors; x must match the (post-transposition) length of a and y its
	// width.
	bli_check_error_code( bli_check_scalar_object( alpha ) );
	bli_check_error_code( bli_check_matrix_object( a ) );
	bli_check_error_code( bli_check_vector_object( x ) );
	bli_check_error_code( bli_check_scalar_object( beta ) );
	bli_check_error_code( bli_check_vector_object( y ) );
	bli_check_error_code( bli_check_vector_dim_equals( x, bli_obj_length_after_trans( a ) ) );
	bli_check_error_code( bli_check_vector_dim_equals( y, bli_obj_width_after_trans( a ) ) );

	// Buffers: every operand goes through the buffer (non-NULLness) check.
	bli_check_error_code( bli_check_object_buffer( alpha ) );
	bli_check_error_code( bli_check_object_buffer( a ) );
	bli_check_error_code( bli_check_object_buffer( x ) );
	bli_check_error_code( bli_check_object_buffer( beta ) );
	bli_check_error_code( bli_check_object_buffer( y ) );
}
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_check.h 0000664 0000000 0000000 00000005533 14634250137 0022053 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Prototype of the axpy2v check function. GENTPROT is redefined for this one
// signature and instantiated exactly once below. PASTEMAC(opname,_check)
// presumably pastes to bli_axpy2v_check, the name defined in
// bli_l1f_check.c -- confirm against the PASTEMAC definition.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alphax, \
obj_t* alphay, \
obj_t* x, \
obj_t* y, \
obj_t* z \
);
GENTPROT( axpy2v )
// Prototype of the axpyf check function. GENTPROT is redefined for this one
// signature and instantiated exactly once below. PASTEMAC(opname,_check)
// presumably pastes to bli_axpyf_check, the name defined in
// bli_l1f_check.c -- confirm against the PASTEMAC definition.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* y \
);
GENTPROT( axpyf )
// Prototype of the dotaxpyv check function. GENTPROT is redefined for this
// one signature and instantiated exactly once below. PASTEMAC(opname,_check)
// presumably pastes to bli_dotaxpyv_check, the name defined in
// bli_l1f_check.c -- confirm against the PASTEMAC definition.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* xt, \
obj_t* x, \
obj_t* y, \
obj_t* rho, \
obj_t* z \
);
GENTPROT( dotaxpyv )
// Prototype of the dotxaxpyf check function. GENTPROT is redefined for this
// one signature and instantiated exactly once below. PASTEMAC(opname,_check)
// presumably pastes to bli_dotxaxpyf_check, the name defined in
// bli_l1f_check.c -- confirm against the PASTEMAC definition.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* at, \
obj_t* a, \
obj_t* w, \
obj_t* x, \
obj_t* beta, \
obj_t* y, \
obj_t* z \
);
GENTPROT( dotxaxpyf )
// Prototype of the dotxf check function. GENTPROT is redefined for this one
// signature and instantiated exactly once below. PASTEMAC(opname,_check)
// presumably pastes to bli_dotxf_check, the name defined in
// bli_l1f_check.c -- confirm against the PASTEMAC definition.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* beta, \
obj_t* y \
);
GENTPROT( dotxf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_fpa.c 0000664 0000000 0000000 00000004157 14634250137 0021540 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each level-1f operation, GENFRONT emits two things:
//   1. a GENARRAY_FPA(...) invocation, given the pasted function-pointer type
//      name (<opname><BLIS_TAPI_EX_SUF>_vft) and the pasted typed-API
//      function name (<opname><BLIS_TAPI_EX_SUF>); presumably this defines
//      the <opname><BLIS_TAPI_EX_SUF>_fpa array used below -- confirm against
//      the GENARRAY_FPA definition;
//   2. a query function, named via PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp),
//      that returns entry [dt] of that array.
// Note that dt is used as the array index directly, with no range check.
#undef GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
{ \
return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
}
// One array + query function per level-1f operation.
GENFRONT( axpy2v )
GENFRONT( axpyf )
GENFRONT( dotaxpyv )
GENFRONT( dotxaxpyf )
GENFRONT( dotxf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_fpa.h 0000664 0000000 0000000 00000003652 14634250137 0021544 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// GENPROT declares, for one operation, the query function whose name is
// pasted by PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp). It takes a num_t dt and
// returns a value of the pasted type <opname><BLIS_TAPI_EX_SUF>_vft. The
// matching definitions are generated by GENFRONT in bli_l1f_fpa.c.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );
// One prototype per level-1f operation.
GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_ft.h 0000664 0000000 0000000 00000007677 14634250137 0021422 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-1f function types --------------------------------------------------
//
// axpy2v
// Function-pointer type for the typed axpy2v API. The type name is pasted by
// PASTECH3(ch,opname,EX_SUF,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led parameter list.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* alpha1, \
ctype* alpha2, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( axpy2v )
// axpyf
// Function-pointer type for the typed axpyf API. The type name is pasted by
// PASTECH3(ch,opname,EX_SUF,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led parameter list.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( axpyf )
// dotaxpyv
// Function-pointer type for the typed dotaxpyv API. The type name is pasted
// by PASTECH3(ch,opname,EX_SUF,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led parameter list.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* rho, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( dotaxpyv )
// dotxf
// Function-pointer type for the typed dotxf API. The type name is pasted by
// PASTECH3(ch,opname,EX_SUF,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led parameter list.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( dotxf )
// dotxaxpyf
// Function-pointer type for the typed dotxaxpyf API. The type name is pasted
// by PASTECH3(ch,opname,EX_SUF,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it must
// expand either to nothing or to a comma-led parameter list.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* w, inc_t incw, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTDEF( dotxaxpyf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_ft_ker.h 0000664 0000000 0000000 00000010560 14634250137 0022244 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H
//
// -- Level-1f kernel function types -------------------------------------------
//
// axpy2v
// Function-pointer type for axpy2v kernels. The type name is pasted by
// PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype. Every data pointer is
// restrict-qualified and the context (cntx_t*) is always the last parameter.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* restrict alpha1, \
ctype* restrict alpha2, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
ctype* restrict z, inc_t incz, \
cntx_t* cntx \
);
INSERT_GENTDEF( axpy2v )
// axpyf
// Function-pointer type for axpyf kernels. The type name is pasted by
// PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype. Every data pointer is
// restrict-qualified and the context (cntx_t*) is always the last parameter.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
cntx_t* cntx \
);
INSERT_GENTDEF( axpyf )
// dotaxpyv
// Function-pointer type for dotaxpyv kernels. The type name is pasted by
// PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype. Every data pointer is
// restrict-qualified and the context (cntx_t*) is always the last parameter.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
ctype* restrict alpha, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
ctype* restrict rho, \
ctype* restrict z, inc_t incz, \
cntx_t* cntx \
);
INSERT_GENTDEF( dotaxpyv )
// dotxf
// Function-pointer type for dotxf kernels. The type name is pasted by
// PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype. Every data pointer is
// restrict-qualified and the context (cntx_t*) is always the last parameter.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict x, inc_t incx, \
ctype* restrict beta, \
ctype* restrict y, inc_t incy, \
cntx_t* cntx \
);
INSERT_GENTDEF( dotxf )
// dotxaxpyf
// Function-pointer type for dotxaxpyf kernels. The type name is pasted by
// PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF (defined elsewhere)
// presumably instantiates GENTDEF once per datatype. Every data pointer is
// restrict-qualified and the context (cntx_t*) is always the last parameter.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict w, inc_t incw, \
ctype* restrict x, inc_t incx, \
ctype* restrict beta, \
ctype* restrict y, inc_t incy, \
ctype* restrict z, inc_t incz, \
cntx_t* cntx \
);
INSERT_GENTDEF( dotxaxpyf )
#endif
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_ker.h 0000664 0000000 0000000 00000004627 14634250137 0021562 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1f kernels.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l1f_ker_prot.h.
// Each group below aliases GENTPROT to one of the *_KER_PROT prototype
// templates and then invokes INSERT_GENTPROT_BASIC0 with a kernel-name token.
// INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it instantiates
// GENTPROT once per datatype.
// NOTE(review): the *_ker_name arguments look like placeholder tokens that
// are expected to be defined (as macros) by whoever includes this header --
// confirm before relying on the resulting symbol names.

// axpy2v
#undef GENTPROT
#define GENTPROT AXPY2V_KER_PROT
INSERT_GENTPROT_BASIC0( axpy2v_ker_name )
// axpyf
#undef GENTPROT
#define GENTPROT AXPYF_KER_PROT
INSERT_GENTPROT_BASIC0( axpyf_ker_name )
// dotaxpyv
#undef GENTPROT
#define GENTPROT DOTAXPYV_KER_PROT
INSERT_GENTPROT_BASIC0( dotaxpyv_ker_name )
// dotxaxpyf
#undef GENTPROT
#define GENTPROT DOTXAXPYF_KER_PROT
INSERT_GENTPROT_BASIC0( dotxaxpyf_ker_name )
// dotxf
#undef GENTPROT
#define GENTPROT DOTXF_KER_PROT
INSERT_GENTPROT_BASIC0( dotxf_ker_name )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_ker_prot.h 0000664 0000000 0000000 00000007741 14634250137 0022626 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1f kernels.
//
// Prototype template for an axpy2v kernel. Expands to the declaration of a
// void function whose name is pasted by PASTEMAC(ch,opname); ctype is the
// element type of the scalar and vector operands. All operand pointers,
// including the trailing cntx_t* context, are restrict-qualified.
#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* restrict alphax, \
ctype* restrict alphay, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
ctype* restrict z, inc_t incz, \
cntx_t* restrict cntx \
);
// Prototype template for an axpyf kernel. Expands to the declaration of a
// void function whose name is pasted by PASTEMAC(ch,opname); ctype is the
// element type of the scalar, matrix and vector operands. All operand
// pointers, including the trailing cntx_t* context, are restrict-qualified.
#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
cntx_t* restrict cntx \
);
// Prototype template for a dotaxpyv kernel. Expands to the declaration of a
// void function whose name is pasted by PASTEMAC(ch,opname); ctype is the
// element type of the scalar and vector operands. All operand pointers,
// including the trailing cntx_t* context, are restrict-qualified.
#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* restrict alpha, \
ctype* restrict x, inc_t incx, \
ctype* restrict y, inc_t incy, \
ctype* restrict rho, \
ctype* restrict z, inc_t incz, \
cntx_t* restrict cntx \
);
// Prototype template for a dotxaxpyf kernel. Expands to the declaration of a
// void function whose name is pasted by PASTEMAC(ch,opname); ctype is the
// element type of the scalar, matrix and vector operands. All operand
// pointers, including the trailing cntx_t* context, are restrict-qualified.
#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict w, inc_t incw, \
ctype* restrict x, inc_t incx, \
ctype* restrict beta, \
ctype* restrict y, inc_t incy, \
ctype* restrict z, inc_t incz, \
cntx_t* restrict cntx \
);
// DOTXF_KER_PROT( ctype, ch, opname ): expands to the prototype of a void
// function whose name is produced by PASTEMAC(ch,opname). Parameters, in
// order: conj_t flags conjat and conjx, the dimensions m and b_n, the scalar
// pointer alpha, the operand a with two strides (inca, lda), the operand x
// with its increment, the scalar pointer beta, the operand y with its
// increment, and a cntx_t context pointer.
// NOTE(review): BLIS documents dotxf as
//   y := beta * y + alpha * conjat(A)^T * conjx(x)
// -- confirm against the BLIS kernel documentation.
#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* restrict alpha, \
ctype* restrict a, inc_t inca, inc_t lda, \
ctype* restrict x, inc_t incx, \
ctype* restrict beta, \
ctype* restrict y, inc_t incy, \
cntx_t* restrict cntx \
);
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_oapi.c 0000664 0000000 0000000 00000027551 14634250137 0021725 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
//
// Define object-based interfaces.
//
#undef GENFRONT
// Object-API front-end for axpy2v. The generated function reads the
// conjugation flags, length, buffers and increments out of the obj_t
// operands, makes local copies of the two scalars cast to the datatype of x,
// and forwards everything to the void*-based function pointer returned by the
// opname/BLIS_TAPI_EX_SUF "_qfp" query.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x drives both scalar casting and function lookup. */ \
    num_t  x_dt   = bli_obj_dt( x ); \
\
    /* Conjugation flags and the length taken from x. */ \
    conj_t x_conj = bli_obj_conj_status( x ); \
    conj_t y_conj = bli_obj_conj_status( y ); \
    dim_t  x_len  = bli_obj_vector_dim( x ); \
\
    /* Buffer address and increment of each vector operand. */ \
    void*  x_buf  = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc  = bli_obj_vector_inc( x ); \
    void*  y_buf  = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc  = bli_obj_vector_inc( y ); \
    void*  z_buf  = bli_obj_buffer_at_off( z ); \
    inc_t  z_inc  = bli_obj_vector_inc( z ); \
\
    /* Storage for the detached scalar copies. */ \
    obj_t  alphax_cast; \
    obj_t  alphay_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \
    } \
\
    /* Copy-cast each scalar to x_dt (no conjugation requested), then fetch
       the address of the copied value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alphax, &alphax_cast ); \
    void*  alphax_buf = bli_obj_buffer_for_1x1( x_dt, &alphax_cast ); \
\
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alphay, &alphay_cast ); \
    void*  alphay_buf = bli_obj_buffer_for_1x1( x_dt, &alphay_cast ); \
\
    /* Look up the datatype-specific function; its signature takes void*
       operands rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp \
    ( \
      x_conj, y_conj, \
      x_len, \
      alphax_buf, alphay_buf, \
      x_buf, x_inc, \
      y_buf, y_inc, \
      z_buf, z_inc, \
      cntx, rntm \
    ); \
}
GENFRONT( axpy2v )
#undef GENFRONT
// Object-API front-end for axpyf. The generated function reads conjugation
// flags, dimensions, buffers and strides out of the obj_t operands, makes a
// local copy of alpha cast to the datatype of x, swaps the strides of a when
// a is marked as transposed, and forwards everything to the void*-based
// function pointer returned by the opname/BLIS_TAPI_EX_SUF "_qfp" query.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x drives both scalar casting and function lookup. */ \
    num_t  x_dt   = bli_obj_dt( x ); \
\
    conj_t a_conj = bli_obj_conj_status( a ); \
    conj_t x_conj = bli_obj_conj_status( x ); \
\
    /* The m argument is the length of y; the b_n argument is the length
       of x. */ \
    dim_t  y_len  = bli_obj_vector_dim( y ); \
    dim_t  x_len  = bli_obj_vector_dim( x ); \
\
    void*  a_buf  = bli_obj_buffer_at_off( a ); \
    inc_t  a_rs   = bli_obj_row_stride( a ); \
    inc_t  a_cs   = bli_obj_col_stride( a ); \
    void*  x_buf  = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc  = bli_obj_vector_inc( x ); \
    void*  y_buf  = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc  = bli_obj_vector_inc( y ); \
\
    /* Storage for the detached scalar copy. */ \
    obj_t  alpha_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, y ); \
    } \
\
    /* Copy-cast alpha to x_dt (no conjugation requested), then fetch the
       address of the copied value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_cast ); \
\
    /* A transposed a is handled by exchanging its two strides. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Look up the datatype-specific function; its signature takes void*
       operands rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp \
    ( \
      a_conj, x_conj, \
      y_len, x_len, \
      alpha_buf, \
      a_buf, a_rs, a_cs, \
      x_buf, x_inc, \
      y_buf, y_inc, \
      cntx, rntm \
    ); \
}
GENFRONT( axpyf )
#undef GENFRONT
// Object-API front-end for dotaxpyv. The generated function reads conjugation
// flags (including that of xt, from which nothing else is read), the length,
// and the buffers/increments of x, y, z and rho out of the obj_t operands,
// makes a local copy of alpha cast to the datatype of x, and forwards
// everything to the void*-based function pointer returned by the
// opname/BLIS_TAPI_EX_SUF "_qfp" query.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x drives both scalar casting and function lookup. */ \
    num_t  x_dt    = bli_obj_dt( x ); \
\
    /* Only the conjugation flag is taken from xt. */ \
    conj_t xt_conj = bli_obj_conj_status( xt ); \
    conj_t x_conj  = bli_obj_conj_status( x ); \
    conj_t y_conj  = bli_obj_conj_status( y ); \
    dim_t  x_len   = bli_obj_vector_dim( x ); \
\
    void*  x_buf   = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc   = bli_obj_vector_inc( x ); \
    void*  y_buf   = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc   = bli_obj_vector_inc( y ); \
    void*  z_buf   = bli_obj_buffer_at_off( z ); \
    inc_t  z_inc   = bli_obj_vector_inc( z ); \
    void*  rho_buf = bli_obj_buffer_at_off( rho ); \
\
    /* Storage for the detached scalar copy. */ \
    obj_t  alpha_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \
    } \
\
    /* Copy-cast alpha to x_dt (no conjugation requested), then fetch the
       address of the copied value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_cast ); \
\
    /* Look up the datatype-specific function; its signature takes void*
       operands rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp \
    ( \
      xt_conj, x_conj, y_conj, \
      x_len, \
      alpha_buf, \
      x_buf, x_inc, \
      y_buf, y_inc, \
      rho_buf, \
      z_buf, z_inc, \
      cntx, rntm \
    ); \
}
GENFRONT( dotaxpyv )
#undef GENFRONT
// Object-API front-end for dotxaxpyf. The generated function reads
// conjugation flags (including that of at, from which nothing else is read),
// dimensions, buffers and strides out of the obj_t operands, makes local
// copies of alpha and beta cast to the datatype of x, swaps the strides of a
// when a is marked as transposed, and forwards everything to the void*-based
// function pointer returned by the opname/BLIS_TAPI_EX_SUF "_qfp" query.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x drives both scalar casting and function lookup. */ \
    num_t  x_dt    = bli_obj_dt( x ); \
\
    /* Only the conjugation flag is taken from at. */ \
    conj_t at_conj = bli_obj_conj_status( at ); \
    conj_t a_conj  = bli_obj_conj_status( a ); \
    conj_t w_conj  = bli_obj_conj_status( w ); \
    conj_t x_conj  = bli_obj_conj_status( x ); \
\
    /* The m argument is the length of z; the b_n argument is the length
       of y. */ \
    dim_t  z_len   = bli_obj_vector_dim( z ); \
    dim_t  y_len   = bli_obj_vector_dim( y ); \
\
    void*  a_buf   = bli_obj_buffer_at_off( a ); \
    inc_t  a_rs    = bli_obj_row_stride( a ); \
    inc_t  a_cs    = bli_obj_col_stride( a ); \
    void*  w_buf   = bli_obj_buffer_at_off( w ); \
    inc_t  w_inc   = bli_obj_vector_inc( w ); \
    void*  x_buf   = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc   = bli_obj_vector_inc( x ); \
    void*  y_buf   = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc   = bli_obj_vector_inc( y ); \
    void*  z_buf   = bli_obj_buffer_at_off( z ); \
    inc_t  z_inc   = bli_obj_vector_inc( z ); \
\
    /* Storage for the detached scalar copies. */ \
    obj_t  alpha_cast; \
    obj_t  beta_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \
    } \
\
    /* Copy-cast each scalar to x_dt (no conjugation requested), then fetch
       the address of the copied value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_cast ); \
\
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_cast ); \
    void*  beta_buf = bli_obj_buffer_for_1x1( x_dt, &beta_cast ); \
\
    /* A transposed a is handled by exchanging its two strides. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Look up the datatype-specific function; its signature takes void*
       operands rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp \
    ( \
      at_conj, a_conj, w_conj, x_conj, \
      z_len, y_len, \
      alpha_buf, \
      a_buf, a_rs, a_cs, \
      w_buf, w_inc, \
      x_buf, x_inc, \
      beta_buf, \
      y_buf, y_inc, \
      z_buf, z_inc, \
      cntx, rntm \
    ); \
}
GENFRONT( dotxaxpyf )
#undef GENFRONT
// Object-API front-end for dotxf. The generated function reads conjugation
// flags, dimensions, buffers and strides out of the obj_t operands, makes
// local copies of alpha and beta cast to the datatype of x, swaps the strides
// of a when a is marked as transposed, and forwards everything to the
// void*-based function pointer returned by the opname/BLIS_TAPI_EX_SUF "_qfp"
// query.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x drives both scalar casting and function lookup. */ \
    num_t  x_dt   = bli_obj_dt( x ); \
\
    /* The conjugation flag of a is passed in the conjat position. */ \
    conj_t a_conj = bli_obj_conj_status( a ); \
    conj_t x_conj = bli_obj_conj_status( x ); \
\
    /* The m argument is the length of x; the b_n argument is the length
       of y. */ \
    dim_t  x_len  = bli_obj_vector_dim( x ); \
    dim_t  y_len  = bli_obj_vector_dim( y ); \
\
    void*  a_buf  = bli_obj_buffer_at_off( a ); \
    inc_t  a_rs   = bli_obj_row_stride( a ); \
    inc_t  a_cs   = bli_obj_col_stride( a ); \
    void*  x_buf  = bli_obj_buffer_at_off( x ); \
    inc_t  x_inc  = bli_obj_vector_inc( x ); \
    void*  y_buf  = bli_obj_buffer_at_off( y ); \
    inc_t  y_inc  = bli_obj_vector_inc( y ); \
\
    /* Storage for the detached scalar copies. */ \
    obj_t  alpha_cast; \
    obj_t  beta_cast; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
    } \
\
    /* Copy-cast each scalar to x_dt (no conjugation requested), then fetch
       the address of the copied value. */ \
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*  alpha_buf = bli_obj_buffer_for_1x1( x_dt, &alpha_cast ); \
\
    bli_obj_scalar_init_detached_copy_of( x_dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_cast ); \
    void*  beta_buf = bli_obj_buffer_for_1x1( x_dt, &beta_cast ); \
\
    /* A transposed a is handled by exchanging its two strides. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Look up the datatype-specific function; its signature takes void*
       operands rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( x_dt ); \
\
    typed_fp \
    ( \
      a_conj, x_conj, \
      x_len, y_len, \
      alpha_buf, \
      a_buf, a_rs, a_cs, \
      x_buf, x_inc, \
      beta_buf, \
      y_buf, y_inc, \
      cntx, rntm \
    ); \
}
GENFRONT( dotxf )
#endif
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_oapi.h 0000664 0000000 0000000 00000006106 14634250137 0021723 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
#undef GENTPROT
// Object-API prototype template, instantiated below for axpy2v. Declares an
// exported void function named PASTEMAC(opname,EX_SUF) taking the obj_t
// operands alphax, alphay, x, y and z, followed by whatever
// BLIS_OAPI_EX_PARAMS expands to.
// NOTE(review): EX_SUF and BLIS_OAPI_EX_PARAMS are defined outside this file;
// presumably they differ between the expert and basic API headers that are
// included before this one -- confirm.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alphax, \
obj_t* alphay, \
obj_t* x, \
obj_t* y, \
obj_t* z \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( axpy2v )
#undef GENTPROT
// Object-API prototype template, instantiated below for axpyf. Declares an
// exported void function named PASTEMAC(opname,EX_SUF) taking the obj_t
// operands alpha, a, x and y, followed by whatever BLIS_OAPI_EX_PARAMS
// expands to.
// NOTE(review): EX_SUF and BLIS_OAPI_EX_PARAMS are defined outside this file;
// presumably they differ between the expert and basic API headers that are
// included before this one -- confirm.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( axpyf )
#undef GENTPROT
// Object-API prototype template, instantiated below for dotaxpyv. Declares an
// exported void function named PASTEMAC(opname,EX_SUF) taking the obj_t
// operands alpha, xt, x, y, rho and z, followed by whatever
// BLIS_OAPI_EX_PARAMS expands to.
// NOTE(review): EX_SUF and BLIS_OAPI_EX_PARAMS are defined outside this file;
// presumably they differ between the expert and basic API headers that are
// included before this one -- confirm.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* xt, \
obj_t* x, \
obj_t* y, \
obj_t* rho, \
obj_t* z \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( dotaxpyv )
#undef GENTPROT
// Object-API prototype template, instantiated below for dotxaxpyf. Declares
// an exported void function named PASTEMAC(opname,EX_SUF) taking the obj_t
// operands alpha, at, a, w, x, beta, y and z, followed by whatever
// BLIS_OAPI_EX_PARAMS expands to.
// NOTE(review): EX_SUF and BLIS_OAPI_EX_PARAMS are defined outside this file;
// presumably they differ between the expert and basic API headers that are
// included before this one -- confirm.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* at, \
obj_t* a, \
obj_t* w, \
obj_t* x, \
obj_t* beta, \
obj_t* y, \
obj_t* z \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( dotxaxpyf )
#undef GENTPROT
// Object-API prototype template, instantiated below for dotxf. Declares an
// exported void function named PASTEMAC(opname,EX_SUF) taking the obj_t
// operands alpha, a, x, beta and y, followed by whatever BLIS_OAPI_EX_PARAMS
// expands to.
// NOTE(review): EX_SUF and BLIS_OAPI_EX_PARAMS are defined outside this file;
// presumably they differ between the expert and basic API headers that are
// included before this one -- confirm.
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* beta, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENTPROT( dotxf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_oapi_ba.c 0000664 0000000 0000000 00000003670 14634250137 0022363 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_oapi_ba.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1f_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_oapi_ex.c 0000664 0000000 0000000 00000003666 14634250137 0022422 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1f_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_tapi.c 0000664 0000000 0000000 00000014362 14634250137 0021726 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
//
// Define BLAS-like interfaces with typed operands.
//
#undef GENTFUNC
// Typed-API template for axpy2v. The generated function substitutes a context
// obtained from bli_gks_query_cntx() when cntx is NULL, fetches the level-1f
// kernel registered under kerid for this datatype, and calls it with the
// caller's arguments unchanged.
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Without a caller-supplied context, ask the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Kernel lookup keyed on this instantiation's datatype constant. */ \
    PASTECH2(ch,opname,_ker_ft) kernel_fp = \
        bli_cntx_get_l1f_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    kernel_fp \
    ( \
      conjx, conjy, \
      n, \
      alphax, alphay, \
      x, incx, \
      y, incy, \
      z, incz, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER )
#undef GENTFUNC
// Typed-API template for axpyf. The generated function substitutes a context
// obtained from bli_gks_query_cntx() when cntx is NULL, fetches the level-1f
// kernel registered under kerid for this datatype, and calls it with the
// caller's arguments unchanged.
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Without a caller-supplied context, ask the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Kernel lookup keyed on this instantiation's datatype constant. */ \
    PASTECH2(ch,opname,_ker_ft) kernel_fp = \
        bli_cntx_get_l1f_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    kernel_fp \
    ( \
      conja, conjx, \
      m, b_n, \
      alpha, \
      a, inca, lda, \
      x, incx, \
      y, incy, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER )
#undef GENTFUNC
// Typed-API template for dotaxpyv. The generated function substitutes a
// context obtained from bli_gks_query_cntx() when cntx is NULL, fetches the
// level-1f kernel registered under kerid for this datatype, and calls it with
// the caller's arguments unchanged.
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Without a caller-supplied context, ask the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Kernel lookup keyed on this instantiation's datatype constant. */ \
    PASTECH2(ch,opname,_ker_ft) kernel_fp = \
        bli_cntx_get_l1f_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    kernel_fp \
    ( \
      conjxt, conjx, conjy, \
      n, \
      alpha, \
      x, incx, \
      y, incy, \
      rho, \
      z, incz, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER )
#undef GENTFUNC
// Typed-API template for dotxaxpyf. The generated function substitutes a
// context obtained from bli_gks_query_cntx() when cntx is NULL, fetches the
// level-1f kernel registered under kerid for this datatype, and calls it with
// the caller's arguments unchanged.
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Without a caller-supplied context, ask the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Kernel lookup keyed on this instantiation's datatype constant. */ \
    PASTECH2(ch,opname,_ker_ft) kernel_fp = \
        bli_cntx_get_l1f_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    kernel_fp \
    ( \
      conjat, conja, conjw, conjx, \
      m, b_n, \
      alpha, \
      a, inca, lda, \
      w, incw, \
      x, incx, \
      beta, \
      y, incy, \
      z, incz, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER )
#undef GENTFUNC
// Typed-API template for dotxf. The generated function substitutes a context
// obtained from bli_gks_query_cntx() when cntx is NULL, fetches the level-1f
// kernel registered under kerid for this datatype, and calls it with the
// caller's arguments unchanged.
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Without a caller-supplied context, ask the gks for one. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Kernel lookup keyed on this instantiation's datatype constant. */ \
    PASTECH2(ch,opname,_ker_ft) kernel_fp = \
        bli_cntx_get_l1f_ker_dt( PASTEMAC(ch,type), kerid, cntx ); \
\
    kernel_fp \
    ( \
      conjat, conjx, \
      m, b_n, \
      alpha, \
      a, inca, lda, \
      x, incx, \
      beta, \
      y, incy, \
      cntx \
    ); \
}
INSERT_GENTFUNC_BASIC( dotxf, BLIS_DOTXF_KER )
#endif
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_tapi.h 0000664 0000000 0000000 00000007614 14634250137 0021735 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
#undef GENTPROT
// Typed-API prototype template, instantiated below for axpy2v. Declares an
// exported void function named PASTEMAC2(ch,opname,EX_SUF) taking conj_t
// flags for x and y, dim_t n, ctype pointers to the scalars alphax/alphay and
// to x, y, z (each with an inc_t increment), followed by whatever
// BLIS_TAPI_EX_PARAMS expands to.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per basic datatype -- confirm.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* alphax, \
ctype* alphay, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( axpy2v )
#undef GENTPROT
// Typed-API prototype template, instantiated below for axpyf. Declares an
// exported void function named PASTEMAC2(ch,opname,EX_SUF) taking conj_t
// flags for a and x, the dimensions m and b_n, a ctype pointer to alpha, the
// operand a with two strides (inca, lda), the operands x and y with one
// increment each, followed by whatever BLIS_TAPI_EX_PARAMS expands to.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per basic datatype -- confirm.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conja, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( axpyf )
#undef GENTPROT
// Typed-API prototype template, instantiated below for dotaxpyv. Declares an
// exported void function named PASTEMAC2(ch,opname,EX_SUF) taking three
// conj_t flags (conjxt, conjx, conjy), dim_t n, a ctype pointer to alpha, the
// operands x and y with their increments, the pointer rho (no increment), the
// operand z with its increment, followed by whatever BLIS_TAPI_EX_PARAMS
// expands to.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per basic datatype -- confirm.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjxt, \
conj_t conjx, \
conj_t conjy, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* rho, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( dotaxpyv )
#undef GENTPROT
// Typed-API prototype template, instantiated below for dotxaxpyf. Declares an
// exported void function named PASTEMAC2(ch,opname,EX_SUF) taking four conj_t
// flags (conjat, conja, conjw, conjx), the dimensions m and b_n, a ctype
// pointer to alpha, the operand a with two strides (inca, lda), the operands
// w and x with their increments, a ctype pointer to beta, the operands y and
// z with their increments, followed by whatever BLIS_TAPI_EX_PARAMS expands
// to.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per basic datatype -- confirm.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjat, \
conj_t conja, \
conj_t conjw, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* w, inc_t incw, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
ctype* z, inc_t incz \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( dotxaxpyf )
#undef GENTPROT
// Typed-API prototype template, instantiated below for dotxf. Declares an
// exported void function named PASTEMAC2(ch,opname,EX_SUF) taking conj_t
// flags conjat and conjx, the dimensions m and b_n, a ctype pointer to alpha,
// the operand a with two strides (inca, lda), the operand x with its
// increment, a ctype pointer to beta, the operand y with its increment,
// followed by whatever BLIS_TAPI_EX_PARAMS expands to.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per basic datatype -- confirm.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjat, \
conj_t conjx, \
dim_t m, \
dim_t b_n, \
ctype* alpha, \
ctype* a, inc_t inca, inc_t lda, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( dotxf )
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_tapi_ba.c 0000664 0000000 0000000 00000003666 14634250137 0022375 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1f_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1f/bli_l1f_tapi_ex.c 0000664 0000000 0000000 00000003664 14634250137 0022425 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_tapi_ex.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1f_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1m/ 0000775 0000000 0000000 00000000000 14634250137 0017236 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m.h 0000664 0000000 0000000 00000004751 14634250137 0020735 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l1m_check.h"
// Define kernel function types.
#include "bli_l1m_ft_ker.h"
// Define object function types for variants.
#include "bli_l1m_oft_var.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
#include "bli_l1m_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_oapi_ba.h"
#include "bli_l1m_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
#include "bli_tapi_ex.h"
#include "bli_l1m_tapi.h"
#include "bli_l1m_ft.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l1m_tapi.h"
#include "bli_l1m_ft.h"
#include "bli_xapi_undef.h"
// Generate function pointer arrays for tapi functions (expert only).
#include "bli_l1m_fpa.h"
// Prototype level-1m implementations.
#include "bli_l1m_unb_var1.h"
// Pack-related
#include "bli_packm.h"
#include "bli_unpackm.h"
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_check.c 0000664 0000000 0000000 00000012443 14634250137 0022062 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based check functions.
//
// Checkers for the two-operand operations. Each instantiation below defines
// a function (named by PASTEMAC from opname and _check) that simply forwards
// its (x, y) operands to bli_l1m_xy_check().
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* y ) \
{ \
    bli_l1m_xy_check( x, y ); \
}

GENFRONT( addm )
GENFRONT( copym )
GENFRONT( subm )
// Checkers for the operations that take a scalar plus two matrices. Each
// instantiation forwards (alpha, x, y) unchanged to bli_l1m_axy_check().
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x, obj_t* y ) \
{ \
    bli_l1m_axy_check( alpha, x, y ); \
}

GENFRONT( axpym )
GENFRONT( scal2m )
// Checkers for the operations that take a scalar and a single matrix. Each
// instantiation forwards (alpha, x) unchanged to bli_l1m_ax_check().
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x ) \
{ \
    bli_l1m_ax_check( alpha, x ); \
}

GENFRONT( scalm )
GENFRONT( setm )
// Checker for xpbym. The operands arrive in (x, beta, y) order, but the
// shared helper bli_l1m_axy_check() expects the scalar first, so beta is
// passed in the scalar (alpha) position.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* beta, obj_t* y ) \
{ \
    bli_l1m_axy_check( beta, x, y ); \
}

GENFRONT( xpbym )
// -----------------------------------------------------------------------------
// Shared operand validation for level-1m operations on two matrix objects.
// Checks run in a fixed order; the result of each one is handed to
// bli_check_error_code() before the next check is made.
void bli_l1m_xy_check( obj_t* x, obj_t* y )
{
    err_t status;

    // Datatype checks on each operand.
    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // The datatypes of x and y must be consistent with one another.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimension checks: each operand is a matrix, and the pair is conformal.
    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    status = bli_check_matrix_object( y );
    bli_check_error_code( status );

    status = bli_check_conformal_dims( x, y );
    bli_check_error_code( status );

    // Buffer checks (non-NULLness).
    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Shared operand validation for level-1m operations on a scalar object
// (alpha) and two matrix objects (x, y). Checks run in a fixed order; the
// result of each one is handed to bli_check_error_code() before the next
// check is made.
void bli_l1m_axy_check( obj_t* alpha, obj_t* x, obj_t* y )
{
    err_t status;

    // Datatype checks: the scalar gets the non-integer check, the matrices
    // get the floating-object check.
    status = bli_check_noninteger_object( alpha );
    bli_check_error_code( status );

    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    // The datatypes of x and y must be consistent with one another.
    status = bli_check_consistent_object_datatypes( x, y );
    bli_check_error_code( status );

    // Dimension checks: alpha is a scalar, x and y are conformal matrices.
    status = bli_check_scalar_object( alpha );
    bli_check_error_code( status );

    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    status = bli_check_matrix_object( y );
    bli_check_error_code( status );

    status = bli_check_conformal_dims( x, y );
    bli_check_error_code( status );

    // Buffer checks (non-NULLness).
    status = bli_check_object_buffer( alpha );
    bli_check_error_code( status );

    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
// Shared operand validation for level-1m operations on a scalar object
// (alpha) and one matrix object (x). Checks run in a fixed order; the result
// of each one is handed to bli_check_error_code() before the next check is
// made.
void bli_l1m_ax_check( obj_t* alpha, obj_t* x )
{
    err_t status;

    // Datatype checks: non-integer check for the scalar, floating-object
    // check for the matrix.
    status = bli_check_noninteger_object( alpha );
    bli_check_error_code( status );

    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    // Dimension checks: alpha is a scalar, x is a matrix.
    status = bli_check_scalar_object( alpha );
    bli_check_error_code( status );

    status = bli_check_matrix_object( x );
    bli_check_error_code( status );

    // Object-property check (currently disabled).
    //status = bli_check_nonunit_diag( x );
    //bli_check_error_code( status );

    // Buffer checks (non-NULLness).
    status = bli_check_object_buffer( alpha );
    bli_check_error_code( status );

    status = bli_check_object_buffer( x );
    bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_check.h 0000664 0000000 0000000 00000005340 14634250137 0022065 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Prototypes for the (x, y) checkers: addm, copym, subm.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* y );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
// Prototypes for the (alpha, x, y) checkers: axpym, scal2m.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x, obj_t* y );

GENPROT( axpym )
GENPROT( scal2m )
// Prototypes for the (alpha, x) checkers: scalm, setm.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* alpha, obj_t* x );

GENPROT( scalm )
GENPROT( setm )
// Prototype for the (x, beta, y) checker: xpbym.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check)( obj_t* x, obj_t* beta, obj_t* y );

GENPROT( xpbym )
// -----------------------------------------------------------------------------
// Shared checker for operations on two matrix objects, x and y.
void bli_l1m_xy_check( obj_t* x, obj_t* y );
// Shared checker for operations on a scalar object (alpha) and two matrix
// objects (x, y).
void bli_l1m_axy_check( obj_t* alpha, obj_t* x, obj_t* y );
// Shared checker for operations on a scalar object (alpha) and one matrix
// object (x).
void bli_l1m_ax_check( obj_t* alpha, obj_t* x );
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_fpa.c 0000664 0000000 0000000 00000005125 14634250137 0021552 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each operation: invoke GENARRAY_FPA with the operation's _vft type name
// and its expert-suffixed name, then define a _qfp query function that
// returns entry [dt] of the corresponding _fpa array (presumably the array
// that GENARRAY_FPA emits -- confirm against its definition).
#undef  GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
              PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
{ \
    return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
}

GENFRONT( addm )
GENFRONT( copym )
GENFRONT( subm )
GENFRONT( axpym )
GENFRONT( scal2m )
GENFRONT( scalm )
GENFRONT( setm )
GENFRONT( xpbym )
//
// Define function pointer query interfaces for two-datatype operations.
//
// Two-datatype variant: invoke GENARRAY_FPA2 and define a _qfp2 query
// function that returns entry [dtx][dty] of the corresponding _fpa2 array
// (presumably the array that GENARRAY_FPA2 emits -- confirm against its
// definition).
#undef  GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA2( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \
{ \
    return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \
}

GENFRONT( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_fpa.h 0000664 0000000 0000000 00000004202 14634250137 0021552 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// Prototype of the single-datatype query function: takes a num_t and returns
// a value of the operation's _vft function-pointer type.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )
// Prototype of the two-datatype query function: takes the datatypes of x and
// y and returns a value of the operation's _vft function-pointer type.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_ft.h 0000664 0000000 0000000 00000007715 14634250137 0021431 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-1m function types --------------------------------------------------
//
// addm, subm, copym
// Function-pointer type for the operations that read matrix x and
// write/update matrix y with no scalar operand.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype*   y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )
// axpym
// Function-pointer type taking a scalar alpha ahead of the x and y matrices.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype*   alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype*   y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )
// scal2m
// Function-pointer type taking a scalar alpha ahead of the x and y matrices
// (same parameter list as the axpym type).
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype*   alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype*   y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )
// scalm, setm
// Function-pointer type for the single-matrix operations: a conjugation flag
// for alpha, the structure of x, its dimensions, then alpha and x itself.
// Unlike the two-matrix types, there is no trans_t parameter.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t   conjalpha, \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )
// xpbym, xpbym_md
// Function-pointer type in which the scalar beta sits between the x and y
// matrix arguments.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype*   beta, \
       ctype*   y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_ft_ker.h 0000664 0000000 0000000 00000010315 14634250137 0022260 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L1M_FT_KER_H
#define BLIS_L1M_FT_KER_H
//
// -- Level-1m kernel function types -------------------------------------------
//
// packm
// NOTE: This is the function type for the structure-aware "kernel". Its
// parameter list carries the structure of c, the pack schema, panel
// dimensions/maxima/offsets, the kappa scalar, the source (c) and packed (p)
// buffers with their strides, a context, and an opaque params pointer.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       struc_t          strucc, \
       diag_t           diagc, \
       uplo_t           uploc, \
       conj_t           conjc, \
       pack_t           schema, \
       bool             invdiag, \
       dim_t            panel_dim, \
       dim_t            panel_len, \
       dim_t            panel_dim_max, \
       dim_t            panel_len_max, \
       dim_t            panel_dim_off, \
       dim_t            panel_len_off, \
       ctype*  restrict kappa, \
       ctype*  restrict c, inc_t incc, inc_t ldc, \
       ctype*  restrict p,             inc_t ldp, \
                           inc_t is_p, \
       cntx_t*          cntx, \
       void*            params \
     );

INSERT_GENTDEF( packm )
// NOTE: the following macros generate packm kernel function type definitions
// that are "ctyped" and void-typed, for each of the floating-point datatypes.
// packm_ker
// Function-pointer type instantiated for packm_cxk: packs from a (with
// strides inca/lda) into p (with stride ldp), scaled via kappa.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( packm_cxk )
// unpackm_ker
// Function-pointer type instantiated for unpackm_cxk. Compared with the
// packm_cxk type, the p and a operands swap positions, and there are no
// schema, cdim, or n_max parameters.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjp, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( unpackm_cxk )
// packm_1er_ker
// Function-pointer type instantiated for packm_cxk_1er; its parameter list
// matches the packm_cxk type.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( packm_cxk_1er )
#endif
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_ker.h 0000664 0000000 0000000 00000006563 14634250137 0021601 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1m kernels.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l1m_ker_prot.h.
// native packm kernels
// GENTPROT is pointed at PACKM_KER_PROT, so every INSERT_GENTPROT_BASIC0 line
// below expands through that prototype template (presumably once per basic
// datatype -- confirm against the INSERT_GENTPROT_BASIC0 definition). The
// *_ker_name arguments are not defined in this header.
#undef GENTPROT
#define GENTPROT PACKM_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_3xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_ker_name )
INSERT_GENTPROT_BASIC0( packm_24xk_ker_name )
// native unpackm kernels
// GENTPROT is re-pointed at UNPACKM_KER_PROT for the instantiations below.
// Note that this list has no 3xk or 24xk entry, unlike the packm list.
#undef GENTPROT
#define GENTPROT UNPACKM_KER_PROT
INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name )
INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name )
// 1e/1r packm kernels
// GENTPROT is re-pointed at PACKM_1ER_KER_PROT for the instantiations below.
// Note that this list has no 3xk or 24xk entry, unlike the native packm list.
#undef GENTPROT
#define GENTPROT PACKM_1ER_KER_PROT
INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name )
INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_ker_prot.h 0000664 0000000 0000000 00000005661 14634250137 0022643 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-1m kernels.
//
// native packm kernels
// Prototype template: declares a void function, named by PASTEMAC(ch,varname),
// that packs from a (strides inca/lda) into p (stride ldp).
#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );
// native unpackm kernels
// Prototype template for the unpack direction: p (stride ldp) precedes a
// (strides inca/lda) in the parameter list.
// NOTE(review): the conjugation parameter is spelled conja here but conjp in
// the unpackm_cxk function type in bli_l1m_ft_ker.h. The difference is
// harmless in a prototype.
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );
// 1e/1r packm kernels
// Prototype template with the same parameter list as PACKM_KER_PROT.
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_oapi.c 0000664 0000000 0000000 00000027474 14634250137 0021747 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
//
// Define object-based interfaces.
//
// Object-API front-end for addm, copym and subm. The generated function
// unpacks the obj_t operands into scalar arguments, optionally runs the
// operation's _check function, and dispatches through the function pointer
// returned by the _qfp query for x's datatype. cntx and rntm are not declared
// in this macro body; presumably they come from BLIS_OAPI_EX_PARAMS or
// BLIS_OAPI_EX_DECLS -- confirm against those definitions.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    num_t     dt        = bli_obj_dt( x ); \
\
    /* Structure and transposition come from x; the dimensions come from y. */ \
    doff_t    diagoffx  = bli_obj_diag_offset( x ); \
    diag_t    diagx     = bli_obj_diag( x ); \
    uplo_t    uplox     = bli_obj_uplo( x ); \
    trans_t   transx    = bli_obj_conjtrans_status( x ); \
    dim_t     m         = bli_obj_length( y ); \
    dim_t     n         = bli_obj_width( y ); \
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     rs_x      = bli_obj_row_stride( x ); \
    inc_t     cs_x      = bli_obj_col_stride( x ); \
    void*     buf_y     = bli_obj_buffer_at_off( y ); \
    inc_t     rs_y      = bli_obj_row_stride( y ); \
    inc_t     cs_y      = bli_obj_col_stride( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( x, y ); \
    } \
\
    /* Look up the implementation for datatype dt. Its function type takes \
       void* buffers rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    f \
    ( \
      diagoffx, \
      diagx, \
      uplox, \
      transx, \
      m, \
      n, \
      buf_x, rs_x, cs_x, \
      buf_y, rs_y, cs_y, \
      cntx, \
      rntm \
    ); \
}

GENFRONT( addm )
GENFRONT( copym )
GENFRONT( subm )
// Object-API front-end for axpym and scal2m. In addition to unpacking x and
// y, the generated function makes a local copy-cast of alpha in x's datatype
// and passes the address of that copy's 1x1 buffer to the typed
// implementation. cntx and rntm are not declared in this macro body;
// presumably they come from BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS --
// confirm against those definitions.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    num_t     dt        = bli_obj_dt( x ); \
\
    /* Structure and transposition come from x; the dimensions come from y. */ \
    doff_t    diagoffx  = bli_obj_diag_offset( x ); \
    diag_t    diagx     = bli_obj_diag( x ); \
    uplo_t    uplox     = bli_obj_uplo( x ); \
    trans_t   transx    = bli_obj_conjtrans_status( x ); \
    dim_t     m         = bli_obj_length( y ); \
    dim_t     n         = bli_obj_width( y ); \
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     rs_x      = bli_obj_row_stride( x ); \
    inc_t     cs_x      = bli_obj_col_stride( x ); \
    void*     buf_y     = bli_obj_buffer_at_off( y ); \
    inc_t     rs_y      = bli_obj_row_stride( y ); \
    inc_t     cs_y      = bli_obj_col_stride( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, y ); \
    } \
\
    /* Create a local copy-cast of the scalar (applying internal conjugation \
       as needed) and take the address of its value. */ \
    obj_t     alpha_local; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \
\
    /* Look up the implementation for datatype dt. Its function type takes \
       void* buffers rather than typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    f \
    ( \
      diagoffx, \
      diagx, \
      uplox, \
      transx, \
      m, \
      n, \
      buf_alpha, \
      buf_x, rs_x, cs_x, \
      buf_y, rs_y, cs_y, \
      cntx, \
      rntm \
    ); \
}

GENFRONT( axpym )
GENFRONT( scal2m )
#undef GENFRONT
// Object-API front-end template for scalm: takes a scalar object alpha and
// an object x. alpha is folded into the scalar attached to a local alias of
// x, and the resulting internal scalar buffer is what gets passed to the
// type-specific function.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    num_t  dt       = bli_obj_dt( x ); \
\
    /* Structure, dimensions, buffer and strides of x. */ \
    doff_t diagoffx = bli_obj_diag_offset( x ); \
    diag_t diagx    = bli_obj_diag( x ); \
    uplo_t uplox    = bli_obj_uplo( x ); \
    dim_t  m        = bli_obj_length( x ); \
    dim_t  n        = bli_obj_width( x ); \
    void*  buf_x    = bli_obj_buffer_at_off( x ); \
    inc_t  rs_x     = bli_obj_row_stride( x ); \
    inc_t  cs_x     = bli_obj_col_stride( x ); \
\
    obj_t  x_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x ); \
    } \
\
    /* x_local aliases x; a non-unit alpha is applied to the scalar */ \
    /* attached to that alias. */ \
    bli_obj_alias_to( x, &x_local ); \
\
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) \
    { \
        obj_t alpha_local; \
\
        /* Detached copy of alpha, cast to the datatype of x. */ \
        bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                              alpha, &alpha_local ); \
\
        bli_obj_scalar_apply_scalar( &alpha_local, &x_local ); \
    } \
\
    /* The scalar handed to the typed function is the internal scalar */ \
    /* buffer of the alias. */ \
    void*  buf_alpha = bli_obj_internal_scalar_buffer( &x_local ); \
\
    /* Look up the void*-based function pointer for datatype dt. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    /* BLIS_NO_CONJUGATE is passed for the conjugation argument. */ \
    f( BLIS_NO_CONJUGATE, \
       diagoffx, diagx, uplox, \
       m, n, \
       buf_alpha, \
       buf_x, rs_x, cs_x, \
       cntx, rntm ); \
}
GENFRONT( scalm )
#undef GENFRONT
// Object-API front-end template for setm: takes a scalar object alpha and an
// object x, casts a detached copy of alpha to the datatype of x, and forwards
// the unpacked fields of x to the type-specific function.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    num_t  dt       = bli_obj_dt( x ); \
\
    /* Structure, dimensions, buffer and strides of x. */ \
    doff_t diagoffx = bli_obj_diag_offset( x ); \
    diag_t diagx    = bli_obj_diag( x ); \
    uplo_t uplox    = bli_obj_uplo( x ); \
    dim_t  m        = bli_obj_length( x ); \
    dim_t  n        = bli_obj_width( x ); \
    void*  buf_x    = bli_obj_buffer_at_off( x ); \
    inc_t  rs_x     = bli_obj_row_stride( x ); \
    inc_t  cs_x     = bli_obj_col_stride( x ); \
\
    obj_t  alpha_local; \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x ); \
    } \
\
    /* Build a detached copy of alpha cast to dt, then take the address */ \
    /* of its 1x1 buffer. */ \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_local ); \
    void*  buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \
\
    /* Look up the void*-based function pointer for datatype dt. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    /* BLIS_NO_CONJUGATE is passed for the conjugation argument. */ \
    f( BLIS_NO_CONJUGATE, \
       diagoffx, diagx, uplox, \
       m, n, \
       buf_alpha, \
       buf_x, rs_x, cs_x, \
       cntx, rntm ); \
}
GENFRONT( setm )
#undef GENFRONT
// Object-API front-end template for xpbym: takes objects x, beta and y.
// When x and y have different datatypes the call is handed to
// bli_xpbym_md(); otherwise the objects are unpacked and forwarded to the
// type-specific function queried for their common datatype.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* Mixed-datatype operands are handled by the _md front-end. The call */ \
    /* and the return are separate statements because a void function may */ \
    /* not return an expression in ISO C. */ \
    /* NOTE(review): this path runs before the _check call below and does */ \
    /* not forward cntx/rntm; confirm that this is intended. */ \
    if ( bli_obj_dt( x ) != bli_obj_dt( y ) ) \
    { \
        bli_xpbym_md( x, beta, y ); \
        return; \
    } \
\
    num_t   dt       = bli_obj_dt( x ); \
\
    /* Structure of x. */ \
    doff_t  diagoffx = bli_obj_diag_offset( x ); \
    diag_t  diagx    = bli_obj_diag( x ); \
    uplo_t  uplox    = bli_obj_uplo( x ); \
    trans_t transx   = bli_obj_conjtrans_status( x ); \
\
    /* The dimensions are read from y. */ \
    dim_t   m        = bli_obj_length( y ); \
    dim_t   n        = bli_obj_width( y ); \
\
    /* Buffers and strides of both operands. */ \
    void*   buf_x    = bli_obj_buffer_at_off( x ); \
    inc_t   rs_x     = bli_obj_row_stride( x ); \
    inc_t   cs_x     = bli_obj_col_stride( x ); \
    void*   buf_y    = bli_obj_buffer_at_off( y ); \
    inc_t   rs_y     = bli_obj_row_stride( y ); \
    inc_t   cs_y     = bli_obj_col_stride( y ); \
\
    void*   buf_beta; \
\
    obj_t   beta_local; \
\
    if ( bli_error_checking_is_enabled() ) \
        PASTEMAC(opname,_check)( x, beta, y ); \
\
    /* Build a detached copy of beta cast to dt, then take the address of */ \
    /* its 1x1 buffer. */ \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_local ); \
    buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    f \
    ( \
      diagoffx, \
      diagx, \
      uplox, \
      transx, \
      m, \
      n, \
      buf_x, rs_x, cs_x, \
      buf_beta, \
      buf_y, rs_y, cs_y, \
      cntx, \
      rntm \
    ); \
}
GENFRONT( xpbym )
#undef GENFRONT
// Object-API front-end template for xpbym_md, the variant used when x and y
// may have different datatypes. beta is cast to the datatype of y and the
// typed function is queried with both datatypes. No _check function is
// called in this template.
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* Datatypes of the two operands. */ \
    num_t   dtx      = bli_obj_dt( x ); \
    num_t   dty      = bli_obj_dt( y ); \
\
    /* Structure of x. */ \
    doff_t  diagoffx = bli_obj_diag_offset( x ); \
    diag_t  diagx    = bli_obj_diag( x ); \
    uplo_t  uplox    = bli_obj_uplo( x ); \
    trans_t transx   = bli_obj_conjtrans_status( x ); \
\
    /* The dimensions are read from y. */ \
    dim_t   m        = bli_obj_length( y ); \
    dim_t   n        = bli_obj_width( y ); \
\
    /* Buffers and strides of both operands. */ \
    void*   buf_x    = bli_obj_buffer_at_off( x ); \
    inc_t   rs_x     = bli_obj_row_stride( x ); \
    inc_t   cs_x     = bli_obj_col_stride( x ); \
    void*   buf_y    = bli_obj_buffer_at_off( y ); \
    inc_t   rs_y     = bli_obj_row_stride( y ); \
    inc_t   cs_y     = bli_obj_col_stride( y ); \
\
    obj_t   beta_local; \
\
    /* Build a detached copy of beta cast to the datatype of y, then take */ \
    /* the address of its 1x1 buffer. */ \
    bli_obj_scalar_init_detached_copy_of( dty, BLIS_NO_CONJUGATE, \
                                          beta, &beta_local ); \
    void*   buf_beta = bli_obj_buffer_for_1x1( dty, &beta_local ); \
\
    /* Look up the void*-based function pointer for the (dtx, dty) pair. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \
\
    f( diagoffx, diagx, uplox, transx, \
       m, n, \
       buf_x, rs_x, cs_x, \
       buf_beta, \
       buf_y, rs_y, cs_y, \
       cntx, rntm ); \
}
GENFRONT( xpbym_md )
#endif
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_oapi.h 0000664 0000000 0000000 00000005143 14634250137 0021741 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
#undef GENPROT
// Prototype template for object-API operations that take two objects, x and
// y. The function name is formed from opname and EX_SUF, and
// BLIS_OAPI_EX_PARAMS is appended to the parameter list; both macros are
// defined outside this header.
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
#undef GENPROT
// Prototype template for object-API operations that take a scalar object
// alpha followed by two objects, x and y. The function name is formed from
// opname and EX_SUF, and BLIS_OAPI_EX_PARAMS is appended to the parameter
// list; both macros are defined outside this header.
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( axpym )
GENPROT( scal2m )
#undef GENPROT
// Prototype template for object-API operations that take a scalar object
// alpha and a single object x. The function name is formed from opname and
// EX_SUF, and BLIS_OAPI_EX_PARAMS is appended to the parameter list; both
// macros are defined outside this header.
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* alpha, \
obj_t* x \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( scalm )
GENPROT( setm )
#undef GENPROT
// Prototype template for object-API operations that take an object x, a
// scalar object beta and an object y, in that order. The function name is
// formed from opname and EX_SUF, and BLIS_OAPI_EX_PARAMS is appended to the
// parameter list; both macros are defined outside this header.
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
( \
obj_t* x, \
obj_t* beta, \
obj_t* y \
BLIS_OAPI_EX_PARAMS \
);
GENPROT( xpbym )
GENPROT( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_oapi_ba.c 0000664 0000000 0000000 00000003670 14634250137 0022401 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_oapi_ba.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1m_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_oapi_ex.c 0000664 0000000 0000000 00000003666 14634250137 0022440 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l1m_oapi.c"
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_oft_var.h 0000664 0000000 0000000 00000004320 14634250137 0022445 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L1M_OFT_VAR_H
#define BLIS_L1M_OFT_VAR_H
//
// -- Level-1m variant function types ------------------------------------------
//
#undef GENTDEF
// Defines the function-pointer type <opname>_var_oft. The pointed-to function
// returns void and takes two objects (a, then p) followed by cntx_t, rntm_t,
// cntl_t and thrinfo_t pointers.
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
obj_t* a, \
obj_t* p, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENTDEF( packm )
#undef GENTDEF
// Defines the function-pointer type <opname>_var_oft. The pointed-to function
// returns void and takes two objects (p, then a) followed by cntx_t, cntl_t
// and thrinfo_t pointers. Unlike the packm type defined earlier in this
// header, there is no rntm_t parameter.
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
obj_t* p, \
obj_t* a, \
cntx_t* cntx, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENTDEF( unpackm )
#endif
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_tapi.c 0000664 0000000 0000000 00000027137 14634250137 0021750 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
//
// Define BLAS-like interfaces with typed operands.
//
#undef GENTFUNC
// Typed front-end template shared by addm and subm. It runs the operation's
// _unb_var1 helper and then, when x is upper- or lower-stored with a unit
// diagonal, calls the auxiliary diagonal operation named by auxker.
#define GENTFUNC( ctype, ch, opname, auxker ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( diagoffx, diagx, uplox, transx, \
      m, n, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
\
    /* Only an upper- or lower-stored x with a unit diagonal needs the */ \
    /* separate post-processing step on the diagonal. */ \
    if ( !bli_is_upper_or_lower( uplox ) ) return; \
    if ( !bli_is_unit_diag( diagx ) ) return; \
\
    PASTEMAC2(ch,auxker,BLIS_TAPI_EX_SUF) \
    ( diagoffx, diagx, transx, \
      m, n, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC( addm, addd )
INSERT_GENTFUNC_BASIC( subm, subd )
#undef GENTFUNC
// Typed front-end template for copym. It runs the _unb_var1 helper and then,
// when x is upper- or lower-stored with a unit diagonal, uses setd to write
// the constant one onto the corresponding diagonal of y.
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( diagoffx, diagx, uplox, transx, \
      m, n, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
\
    /* A unit diagonal of an upper- or lower-stored x is handled here as */ \
    /* a separate post-processing step. */ \
    if ( bli_is_upper_or_lower( uplox ) && bli_is_unit_diag( diagx ) ) \
    { \
        ctype* one      = PASTEMAC(ch,1); \
        doff_t diagoffy = diagoffx; \
\
        /* The offset used for y is the negated offset of x when transx */ \
        /* requests a transposition. */ \
        if ( bli_does_trans( transx ) ) \
        { \
            bli_negate_diag_offset( &diagoffy ); \
        } \
\
        PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
        ( BLIS_NO_CONJUGATE, \
          diagoffy, \
          m, n, \
          one, \
          y, rs_y, cs_y, \
          cntx, rntm ); \
    } \
}
INSERT_GENTFUNC_BASIC0( copym )
#undef GENTFUNC
// Typed front-end template for axpym. A zero alpha makes the call a no-op;
// otherwise the _unb_var1 helper runs and, when x is upper- or lower-stored
// with a unit diagonal, axpyd is called for the diagonal.
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* With alpha equal to zero there is nothing to do. */ \
    if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( diagoffx, diagx, uplox, transx, \
      m, n, \
      alpha, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
\
    /* Only an upper- or lower-stored x with a unit diagonal needs the */ \
    /* separate post-processing step on the diagonal. */ \
    if ( !bli_is_upper_or_lower( uplox ) ) return; \
    if ( !bli_is_unit_diag( diagx ) ) return; \
\
    PASTEMAC2(ch,axpyd,BLIS_TAPI_EX_SUF) \
    ( diagoffx, diagx, transx, \
      m, n, \
      alpha, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( axpym )
#undef GENTFUNC
// Typed front-end template for scal2m. A zero alpha is handled by calling
// setm on y with alpha as the value; otherwise the _unb_var1 helper runs and,
// when x is upper- or lower-stored with a unit diagonal, setd writes alpha
// onto the corresponding diagonal of y.
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* A zero alpha means the output is simply set to zero. Doing this */ \
    /* explicitly matters because it clears any NaNs and Infs in x that */ \
    /* would otherwise propagate. */ \
    /* NOTE(review): setm receives diagoffx and uplox unchanged even when */ \
    /* transx requests a transposition, whereas the setd call further */ \
    /* down negates the offset first; confirm that this is intended. */ \
    if ( PASTEMAC(ch,eq0)( *alpha ) ) \
    { \
        PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
        ( BLIS_NO_CONJUGATE, \
          diagoffx, diagx, uplox, \
          m, n, \
          alpha, \
          y, rs_y, cs_y, \
          cntx, rntm ); \
        return; \
    } \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( diagoffx, diagx, uplox, transx, \
      m, n, \
      alpha, \
      x, rs_x, cs_x, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
\
    /* A unit diagonal of an upper- or lower-stored x is handled here as */ \
    /* a separate post-processing step. */ \
    if ( bli_is_upper_or_lower( uplox ) && bli_is_unit_diag( diagx ) ) \
    { \
        doff_t diagoffy = diagoffx; \
\
        /* The offset used for y is the negated offset of x when transx */ \
        /* requests a transposition. */ \
        if ( bli_does_trans( transx ) ) \
        { \
            bli_negate_diag_offset( &diagoffy ); \
        } \
\
        PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
        ( BLIS_NO_CONJUGATE, \
          diagoffy, \
          m, n, \
          alpha, \
          y, rs_y, cs_y, \
          cntx, rntm ); \
    } \
}
INSERT_GENTFUNC_BASIC0( scal2m )
#undef GENTFUNC
// Typed front-end template shared by scalm and setm. After the zero-dimension
// check and context lookup, all arguments are passed straight through to the
// operation's _unb_var1 helper.
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( conjalpha, \
      diagoffx, diagx, uplox, \
      m, n, \
      alpha, \
      x, rs_x, cs_x, \
      cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( scalm )
INSERT_GENTFUNC_BASIC0( setm )
#undef GENTFUNC
// Typed front-end template for xpbym. A zero beta reduces the operation to
// copym; otherwise the _unb_var1 helper runs and, when x is upper- or
// lower-stored with a unit diagonal, xpbyd is called for the diagonal.
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* If beta is zero, then the operation reduces to copym. The copym */ \
    /* front-end is called instead of its _unb_var1 helper so that a unit */ \
    /* diagonal of an upper- or lower-stored x still gets the */ \
    /* post-processing step that the front-end performs after the helper. */ \
    /* Returning straight after the helper would skip that step, unlike */ \
    /* the non-zero beta path below. */ \
    if ( PASTEMAC(ch,eq0)( *beta ) ) \
    { \
        PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
        ( \
          diagoffx, \
          diagx, \
          uplox, \
          transx, \
          m, \
          n, \
          x, rs_x, cs_x, \
          y, rs_y, cs_y, \
          cntx, \
          rntm \
        ); \
\
        return; \
    } \
\
    /* Invoke the helper variant, which loops over the appropriate kernel */ \
    /* to implement the current operation. */ \
    PASTEMAC2(ch,opname,_unb_var1) \
    ( \
      diagoffx, \
      diagx, \
      uplox, \
      transx, \
      m, \
      n, \
      x, rs_x, cs_x, \
      beta, \
      y, rs_y, cs_y, \
      cntx, \
      rntm \
    ); \
\
    /* When the diagonal of an upper- or lower-stored matrix is unit, */ \
    /* we handle it with a separate post-processing step. */ \
    if ( bli_is_upper_or_lower( uplox ) && \
         bli_is_unit_diag( diagx ) ) \
    { \
        PASTEMAC2(ch,xpbyd,BLIS_TAPI_EX_SUF) \
        ( \
          diagoffx, \
          diagx, \
          transx, \
          m, \
          n, \
          x, rs_x, cs_x, \
          beta, \
          y, rs_y, cs_y, \
          cntx, \
          rntm \
        ); \
    } \
}
INSERT_GENTFUNC_BASIC0( xpbym )
#undef GENTFUNC2
// Typed front-end template for xpbym_md, where x has type ctype_x and both
// beta and y have type ctype_y. A zero beta is handled by castm; otherwise
// the mixed-type _unb_var1 helper is invoked. This template has no
// unit-diagonal post-processing step.
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    /* Early exit decided by bli_zero_dim2() on the two dimensions. */ \
    if ( bli_zero_dim2( m, n ) ) return; \
\
    /* Without a caller-supplied context, query one from the gks. */ \
    if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
    /* With beta equal to zero, x is written into y through castm. Only */ \
    /* transx, the dimensions and the two buffers with their strides are */ \
    /* passed to that call. */ \
    if ( PASTEMAC(chy,eq0)( *beta ) ) \
    { \
        PASTEMAC2(chx,chy,castm) \
        ( transx, \
          m, n, \
          x, rs_x, cs_x, \
          y, rs_y, cs_y ); \
\
        return; \
    } \
\
    /* The helper variant loops over the kernel for this operation. */ \
    PASTEMAC3(chx,chy,opname,_unb_var1) \
    ( diagoffx, diagx, uplox, transx, \
      m, n, \
      x, rs_x, cs_x, \
      beta, \
      y, rs_y, cs_y, \
      cntx, rntm ); \
}
INSERT_GENTFUNC2_BASIC0( xpbym_md )
INSERT_GENTFUNC2_MIXDP0( xpbym_md )
#endif
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_tapi.h 0000664 0000000 0000000 00000007774 14634250137 0021762 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
#undef GENTPROT
// Prototype template for typed operations on two matrices. Parameters are the
// diagonal offset, diag, uplo and trans values for x, the dimensions m and n,
// and the x and y buffers, each followed by its row and column stride.
// BLIS_TAPI_EX_PARAMS, defined outside this header, is appended to the list.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )
#undef GENTPROT
// Prototype template for typed operations on two matrices with a scalar.
// Parameters are the diagonal offset, diag, uplo and trans values for x, the
// dimensions m and n, a pointer to alpha, and the x and y buffers, each
// followed by its row and column stride. BLIS_TAPI_EX_PARAMS, defined outside
// this header, is appended to the list.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )
#undef GENTPROT
// Prototype template for typed operations on a single matrix with a scalar.
// Parameters are a conj_t value for alpha, the diagonal offset, diag and uplo
// values for x, the dimensions m and n, a pointer to alpha, and the x buffer
// followed by its row and column stride. There is no trans parameter.
// BLIS_TAPI_EX_PARAMS, defined outside this header, is appended to the list.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjalpha, \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )
#undef GENTPROT
// Prototype template for typed xpbym. Parameters are the diagonal offset,
// diag, uplo and trans values for x, the dimensions m and n, the x buffer
// with its strides, a pointer to beta, and the y buffer with its strides.
// All three pointers share the same ctype. BLIS_TAPI_EX_PARAMS, defined
// outside this header, is appended to the list.
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* beta, \
ctype* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( xpbym )
#undef GENTPROT2
// Prototype template for typed xpbym_md, which takes two element types: x is
// a ctype_x pointer, while beta and y are ctype_y pointers. The function name
// is built from both type characters (chx, chy), opname and EX_SUF.
// BLIS_TAPI_EX_PARAMS, defined outside this header, is appended to the list.
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype_x* x, inc_t rs_x, inc_t cs_x, \
ctype_y* beta, \
ctype_y* y, inc_t rs_y, inc_t cs_y \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_tapi_ba.c 0000664 0000000 0000000 00000003666 14634250137 0022413 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1m_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_tapi_ex.c 0000664 0000000 0000000 00000003664 14634250137 0022443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// This translation unit contains no definitions of its own: it selects the
// "ex" (expert-parameter) flavor of the typed-API template macros and then
// textually includes the shared definitions from bli_l1m_tapi.c.
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_tapi_ex.h"
// Define the macro protecting the typed API definitions.
// (It must be defined before the .c file below is included.)
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l1m_tapi.c"
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_unb_var1.c 0000664 0000000 0000000 00000035444 14634250137 0022530 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces with typed operands.
//
// Template for the two-operand, scalar-free unblocked variants (instantiated
// below for addm, copym and subm). Each generated function obtains the
// effective storage of x plus the loop bounds, strides and offsets from
// bli_set_dims_incs_uplo_2m(), then applies the level-1v kernel identified by
// kerid to one vector of x and y per iteration. The rntm argument is accepted
// but not referenced in the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	uplo_t uplox_eff; \
	dim_t  n_iter, n_elem_max; \
	inc_t  incx, ldx; \
	inc_t  incy, ldy; \
	dim_t  ij0, n_shift; \
\
	/* Derive the effective uplo, iteration counts, strides and offsets. */ \
	bli_set_dims_incs_uplo_2m \
	( \
	  diagoffx, diagx, transx, \
	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
	  &ij0, &n_shift \
	); \
\
	/* Nothing to do when the effective storage is all zeros. */ \
	if ( bli_is_zeros( uplox_eff ) ) return; \
\
	/* Only the conjugation component of transx is handed to the kernel. */ \
	const conj_t conjx = bli_extract_conj( transx ); \
\
	/* Look up the level-1v kernel for this datatype in the context. */ \
	const num_t dt = PASTEMAC(ch,type); \
	PASTECH2(ch,kername,_ker_ft) kfp = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		/* Dense: every vector is processed at full length. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			kfp( conjx, n_elem_max, x + j*ldx, incx, y + j*ldy, incy, cntx ); \
		} \
	} \
	else if ( bli_is_upper( uplox_eff ) ) \
	{ \
		/* Upper: vector ij0+j holds min( n_shift + j + 1, n_elem_max )
		   elements, counted from its first element. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t len = bli_min( n_shift + j + 1, n_elem_max ); \
			ctype*      xj  = x + (ij0+j)*ldx; \
			ctype*      yj  = y + (ij0+j)*ldy; \
\
			kfp( conjx, len, xj, incx, yj, incy, cntx ); \
		} \
	} \
	else if ( bli_is_lower( uplox_eff ) ) \
	{ \
		/* Lower: vector j skips its first max( 0, j - n_shift ) elements
		   (in addition to the ij0 offset) and is shortened accordingly. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t skip = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
			const dim_t len  = n_elem_max - skip; \
			ctype*      xj   = x + j*ldx + (ij0+skip)*incx; \
			ctype*      yj   = y + j*ldy + (ij0+skip)*incy; \
\
			kfp( conjx, len, xj, incx, yj, incy, cntx ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( addm_unb_var1, addv, BLIS_ADDV_KER )
INSERT_GENTFUNC_BASIC2( copym_unb_var1, copyv, BLIS_COPYV_KER )
INSERT_GENTFUNC_BASIC2( subm_unb_var1, subv, BLIS_SUBV_KER )
// Template for the two-operand unblocked variants that take a scalar alpha
// (instantiated below for axpym and scal2m). Each generated function obtains
// the effective storage of x plus the loop bounds, strides and offsets from
// bli_set_dims_incs_uplo_2m(), then applies the level-1v kernel identified by
// kerid, with alpha, to one vector of x and y per iteration. The rntm
// argument is accepted but not referenced in the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	uplo_t uplox_eff; \
	dim_t  n_iter, n_elem_max; \
	inc_t  incx, ldx; \
	inc_t  incy, ldy; \
	dim_t  ij0, n_shift; \
\
	/* Derive the effective uplo, iteration counts, strides and offsets. */ \
	bli_set_dims_incs_uplo_2m \
	( \
	  diagoffx, diagx, transx, \
	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
	  &ij0, &n_shift \
	); \
\
	/* Nothing to do when the effective storage is all zeros. */ \
	if ( bli_is_zeros( uplox_eff ) ) return; \
\
	/* Only the conjugation component of transx is handed to the kernel. */ \
	const conj_t conjx = bli_extract_conj( transx ); \
\
	/* Look up the level-1v kernel for this datatype in the context. */ \
	const num_t dt = PASTEMAC(ch,type); \
	PASTECH2(ch,kername,_ker_ft) kfp = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		/* Dense: every vector is processed at full length. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			kfp( conjx, n_elem_max, alpha, x + j*ldx, incx, y + j*ldy, incy, cntx ); \
		} \
	} \
	else if ( bli_is_upper( uplox_eff ) ) \
	{ \
		/* Upper: vector ij0+j holds min( n_shift + j + 1, n_elem_max )
		   elements, counted from its first element. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t len = bli_min( n_shift + j + 1, n_elem_max ); \
			ctype*      xj  = x + (ij0+j)*ldx; \
			ctype*      yj  = y + (ij0+j)*ldy; \
\
			kfp( conjx, len, alpha, xj, incx, yj, incy, cntx ); \
		} \
	} \
	else if ( bli_is_lower( uplox_eff ) ) \
	{ \
		/* Lower: vector j skips its first max( 0, j - n_shift ) elements
		   (in addition to the ij0 offset) and is shortened accordingly. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t skip = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
			const dim_t len  = n_elem_max - skip; \
			ctype*      xj   = x + j*ldx + (ij0+skip)*incx; \
			ctype*      yj   = y + j*ldy + (ij0+skip)*incy; \
\
			kfp( conjx, len, alpha, xj, incx, yj, incy, cntx ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( axpym_unb_var1, axpyv, BLIS_AXPYV_KER )
INSERT_GENTFUNC_BASIC2( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER )
// Template for the single-operand unblocked variants (instantiated below for
// scalm and setm). Each generated function obtains the effective storage of
// x plus the loop bounds, strides and offsets from
// bli_set_dims_incs_uplo_1m(), then applies the level-1v kernel identified by
// kerid, with conjalpha and alpha, to one vector of x per iteration. The rntm
// argument is accepted but not referenced in the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	uplo_t uplox_eff; \
	dim_t  n_iter, n_elem_max; \
	inc_t  incx, ldx; \
	dim_t  ij0, n_shift; \
\
	/* Derive the effective uplo, iteration counts, strides and offsets. */ \
	bli_set_dims_incs_uplo_1m \
	( \
	  diagoffx, diagx, \
	  uplox, m, n, rs_x, cs_x, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
	  &ij0, &n_shift \
	); \
\
	/* Nothing to do when the effective storage is all zeros. */ \
	if ( bli_is_zeros( uplox_eff ) ) return; \
\
	/* Look up the level-1v kernel for this datatype in the context. */ \
	const num_t dt = PASTEMAC(ch,type); \
	PASTECH2(ch,kername,_ker_ft) kfp = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		/* Dense: every vector is processed at full length. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			kfp( conjalpha, n_elem_max, alpha, x + j*ldx, incx, cntx ); \
		} \
	} \
	else if ( bli_is_upper( uplox_eff ) ) \
	{ \
		/* Upper: vector ij0+j holds min( n_shift + j + 1, n_elem_max )
		   elements, counted from its first element. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t len = bli_min( n_shift + j + 1, n_elem_max ); \
			ctype*      xj  = x + (ij0+j)*ldx; \
\
			kfp( conjalpha, len, alpha, xj, incx, cntx ); \
		} \
	} \
	else if ( bli_is_lower( uplox_eff ) ) \
	{ \
		/* Lower: vector j skips its first max( 0, j - n_shift ) elements
		   (in addition to the ij0 offset) and is shortened accordingly. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t skip = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
			const dim_t len  = n_elem_max - skip; \
			ctype*      xj   = x + j*ldx + (ij0+skip)*incx; \
\
			kfp( conjalpha, len, alpha, xj, incx, cntx ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
INSERT_GENTFUNC_BASIC2( setm_unb_var1, setv, BLIS_SETV_KER )
// Template for the two-operand unblocked variant that takes a scalar beta
// (instantiated below for xpbym). The generated function obtains the
// effective storage of x plus the loop bounds, strides and offsets from
// bli_set_dims_incs_uplo_2m(), then applies the level-1v kernel identified by
// kerid, with beta, to one vector of x and y per iteration. The rntm argument
// is accepted but not referenced in the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     ) \
{ \
	uplo_t uplox_eff; \
	dim_t  n_iter, n_elem_max; \
	inc_t  incx, ldx; \
	inc_t  incy, ldy; \
	dim_t  ij0, n_shift; \
\
	/* Derive the effective uplo, iteration counts, strides and offsets. */ \
	bli_set_dims_incs_uplo_2m \
	( \
	  diagoffx, diagx, transx, \
	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
	  &ij0, &n_shift \
	); \
\
	/* Nothing to do when the effective storage is all zeros. */ \
	if ( bli_is_zeros( uplox_eff ) ) return; \
\
	/* Only the conjugation component of transx is handed to the kernel. */ \
	const conj_t conjx = bli_extract_conj( transx ); \
\
	/* Look up the level-1v kernel for this datatype in the context. */ \
	const num_t dt = PASTEMAC(ch,type); \
	PASTECH2(ch,kername,_ker_ft) kfp = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		/* Dense: every vector is processed at full length. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			kfp( conjx, n_elem_max, x + j*ldx, incx, beta, y + j*ldy, incy, cntx ); \
		} \
	} \
	else if ( bli_is_upper( uplox_eff ) ) \
	{ \
		/* Upper: vector ij0+j holds min( n_shift + j + 1, n_elem_max )
		   elements, counted from its first element. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t len = bli_min( n_shift + j + 1, n_elem_max ); \
			ctype*      xj  = x + (ij0+j)*ldx; \
			ctype*      yj  = y + (ij0+j)*ldy; \
\
			kfp( conjx, len, xj, incx, beta, yj, incy, cntx ); \
		} \
	} \
	else if ( bli_is_lower( uplox_eff ) ) \
	{ \
		/* Lower: vector j skips its first max( 0, j - n_shift ) elements
		   (in addition to the ij0 offset) and is shortened accordingly. */ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t skip = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
			const dim_t len  = n_elem_max - skip; \
			ctype*      xj   = x + j*ldx + (ij0+skip)*incx; \
			ctype*      yj   = y + j*ldy + (ij0+skip)*incy; \
\
			kfp( conjx, len, xj, incx, beta, yj, incy, cntx ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC2( xpbym_unb_var1, xpbyv, BLIS_XPBYV_KER )
// Template for the mixed-datatype xpbym unblocked variant, where x has type
// ctype_x while beta and y have type ctype_y. The generated function applies
// scalar-level macros element by element instead of calling a level-1v
// kernel: "adds" when beta equals one, "xpbys" otherwise. Every one of the
// n_iter vectors is traversed at full length n_elem_max; uplox_eff, ij0 and
// n_shift are filled in by bli_set_dims_incs_uplo_2m() but not consulted, and
// no conjugation is extracted from transx. The cntx and rntm arguments are
// not referenced in the body.
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC2(chx,chy,opname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     ) \
{ \
	uplo_t uplox_eff; \
	dim_t  n_iter, n_elem_max; \
	inc_t  incx, ldx; \
	inc_t  incy, ldy; \
	dim_t  ij0, n_shift; \
\
	/* Derive the iteration counts and strides. */ \
	bli_set_dims_incs_uplo_2m \
	( \
	  diagoffx, diagx, transx, \
	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
	  &ij0, &n_shift \
	); \
\
	/* Conjugation extraction is intentionally left disabled here: */ \
	/*conjx = bli_extract_conj( transx );*/ \
\
	if ( PASTEMAC(chy,eq1)( *beta ) ) \
	{ \
		/* beta == 1: accumulate x into y. */ \
		if ( incx == 1 && incy == 1 ) \
		{ \
			/* Unit-stride case: index the vectors directly. */ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				ctype_x* restrict xj = x + j*ldx; \
				ctype_y* restrict yj = y + j*ldy; \
\
				for ( dim_t i = 0; i < n_elem_max; ++i ) \
				{ \
					PASTEMAC2(chx,chy,adds)( xj[i], yj[i] ); \
				} \
			} \
		} \
		else \
		{ \
			/* General-stride case: walk both vectors by their increments. */ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				ctype_x* restrict chi1 = x + j*ldx; \
				ctype_y* restrict psi1 = y + j*ldy; \
\
				for ( dim_t i = 0; i < n_elem_max; ++i, chi1 += incx, psi1 += incy ) \
				{ \
					PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		/* beta != 1: apply xpbys with beta to each element pair. */ \
		if ( incx == 1 && incy == 1 ) \
		{ \
			/* Unit-stride case: index the vectors directly. */ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				ctype_x* restrict xj = x + j*ldx; \
				ctype_y* restrict yj = y + j*ldy; \
\
				for ( dim_t i = 0; i < n_elem_max; ++i ) \
				{ \
					PASTEMAC3(chx,chy,chy,xpbys)( xj[i], *beta, yj[i] ); \
				} \
			} \
		} \
		else \
		{ \
			/* General-stride case: walk both vectors by their increments. */ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				ctype_x* restrict chi1 = x + j*ldx; \
				ctype_y* restrict psi1 = y + j*ldy; \
\
				for ( dim_t i = 0; i < n_elem_max; ++i, chi1 += incx, psi1 += incy ) \
				{ \
					PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
				} \
			} \
		} \
	} \
}
INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 )
INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/1m/bli_l1m_unb_var1.h 0000664 0000000 0000000 00000010010 14634250137 0022513 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
// Prototype template for the scalar-free two-operand variants; the name of
// each generated function is formed by pasting ch, opname and _unb_var1.
// Operands: x (read with strides rs_x/cs_x) and y (strides rs_y/cs_y), both
// of element type ctype. Instantiated below for addm, copym and subm.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )
// Prototype template for the two-operand variants that additionally take a
// scalar alpha (pointer to ctype) ahead of the x and y operands; the name of
// each generated function is formed by pasting ch, opname and _unb_var1.
// Instantiated below for axpym and scal2m.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* y, inc_t rs_y, inc_t cs_y, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )
// Prototype template for the single-operand variants: they take a conj_t
// conjalpha and a scalar alpha (pointer to ctype) together with one matrix
// operand x, and have no transx parameter. The name of each generated
// function is formed by pasting ch, opname and _unb_var1. Instantiated below
// for scalm and setm.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
( \
conj_t conjalpha, \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t rs_x, inc_t cs_x, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )
// Prototype template for the two-operand variant that takes a scalar beta
// (pointer to ctype) between the x and y operands; the name of the generated
// function is formed by pasting ch, opname and _unb_var1. Instantiated below
// for xpbym.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \
ctype* beta, \
ctype* y, inc_t rs_y, inc_t cs_y, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( xpbym )
// Prototype template for the mixed-datatype xpbym variant: x has element
// type ctype_x, while beta and y have element type ctype_y. The name of each
// generated function is formed by pasting chx, chy, opname and _unb_var1.
// Instantiated below through both the BASIC0 and MIXDP0 inserter macros.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
( \
doff_t diagoffx, \
diag_t diagx, \
uplo_t uplox, \
trans_t transx, \
dim_t m, \
dim_t n, \
ctype_x* x, inc_t rs_x, inc_t cs_x, \
ctype_y* beta, \
ctype_y* y, inc_t rs_y, inc_t cs_y, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
cython-blis-1.0.0/blis/_src/frame/1m/other/ 0000775 0000000 0000000 00000000000 14634250137 0020357 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1m/other/bli_scalm.h 0000664 0000000 0000000 00000003246 14634250137 0022462 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_scalm_cntl.h"
cython-blis-1.0.0/blis/_src/frame/1m/other/bli_scalm_cntl.c 0000664 0000000 0000000 00000004205 14634250137 0023471 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Create a control tree node for scalm that invokes var_func and has
// sub_node as its child. Returns whatever bli_cntl_create_node() returns.
cntl_t* bli_scalm_cntl_create_node
     (
       void_fp var_func,
       cntl_t* sub_node
     )
{
	// The bszid field must be BLIS_NO_PART: it signals that this node does
	// no blocksize partitioning, which bli_cntl_free() depends on in order
	// to walk the thrinfo_t tree in step with the cntl_t tree.
	return bli_cntl_create_node
	(
	  BLIS_NOID,
	  BLIS_NO_PART,
	  var_func,
	  NULL,
	  sub_node
	);
}
cython-blis-1.0.0/blis/_src/frame/1m/other/bli_scalm_cntl.h 0000664 0000000 0000000 00000003355 14634250137 0023503 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Creates and returns a control tree node for scalm.
//   var_func: function pointer stored in the new node.
//   sub_node: node attached as the new node's child.
cntl_t* bli_scalm_cntl_create_node
(
void_fp var_func,
cntl_t* sub_node
);
cython-blis-1.0.0/blis/_src/frame/1m/other/bli_scalm_int.c 0000664 0000000 0000000 00000006434 14634250137 0023331 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by every entry of the dispatch table below:
// a scalm variant taking the scalar alpha, the operand x and a context.
#define FUNCPTR_T scalm_fp
typedef void (*FUNCPTR_T)( obj_t* alpha,
obj_t* x,
cntx_t* cntx );
// Dispatch table consulted by bli_scalm_int(), indexed as
// vars[variant number][implementation type]. There is a single variant row;
// its third (blocked) slot is NULL, so no function exists for that
// combination.
static FUNCPTR_T vars[1][3] =
{
// unblocked optimized unblocked blocked
{ bli_scalm_ex, bli_scalm_ex, NULL }
};
// Internal scalm entry point driven by a control tree node: after a series
// of early-exit checks it selects a variant from the vars table, using the
// variant number and implementation type stored in cntl, and invokes it
// with alpha, x and cntx.
void bli_scalm_int
     (
       obj_t*   alpha,
       obj_t*   x,
       cntx_t*  cntx,
       scalm_t* cntl
     )
{
	// An operand with a zero dimension leaves nothing to scale.
	if ( bli_obj_has_zero_dim( x ) ) return;

	// Validate the operands when error checking is turned on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_scalm_check( alpha, x );
	}

	// The control tree may request that this operation be skipped.
	if ( bli_cntl_is_noop( cntl ) ) return;

	// Scaling is a no-op when alpha and the scalar attached to x are both
	// unit.
	if ( bli_obj_equals( alpha, &BLIS_ONE ) &&
	     bli_obj_scalar_equals( x, &BLIS_ONE ) )
	{
		return;
	}

	// NOTE: An earlier revision aliased x to a local object (x_local) and,
	// when alpha was non-unit, folded alpha into that alias' attached scalar
	// via bli_obj_scalar_apply_scalar(). That code has been disabled since
	// the alpha parameter was added back to the object interface of the
	// underlying scalm variant.

	// Read the variant number and implementation type from the control
	// tree node and use them to pick the function from the dispatch table.
	const varnum_t var_num   = bli_cntl_var_num( cntl );
	const impl_t   impl_type = bli_cntl_impl_type( cntl );
	FUNCPTR_T      variant   = vars[var_num][impl_type];

	// Run the selected variant.
	variant( alpha, x, cntx );
}
cython-blis-1.0.0/blis/_src/frame/1m/other/bli_scalm_int.h 0000664 0000000 0000000 00000003426 14634250137 0023334 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal, control-tree-driven scalm.
//   alpha: scalar operand.
//   x:     operand to be scaled.
//   cntx:  context forwarded to the selected variant.
//   cntl:  control tree node that determines which variant runs.
void bli_scalm_int( obj_t* alpha,
obj_t* x,
cntx_t* cntx,
scalm_t* cntl );
cython-blis-1.0.0/blis/_src/frame/1m/packm/ 0000775 0000000 0000000 00000000000 14634250137 0020331 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm.h 0000664 0000000 0000000 00000004222 14634250137 0022423 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_packm_alloc.h"
#include "bli_packm_cntl.h"
#include "bli_packm_check.h"
#include "bli_packm_init.h"
#include "bli_packm_int.h"
#include "bli_packm_scalar.h"
#include "bli_packm_part.h"
#include "bli_packm_struc_cxk.h"
#include "bli_packm_struc_cxk_1er.h"
#include "bli_packm_cxk.h"
#include "bli_packm_cxk_1er.h"
// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
#include "bli_packm_struc_cxk_md.h"
#endif
#include "bli_packm_blk_var1.h"
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_alloc.c 0000664 0000000 0000000 00000006744 14634250137 0023603 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Convenience front end to bli_packm_alloc_ex(): the pack buffer type is not
// supplied by the caller but read from the packm parameters stored in the
// control tree node. All other arguments are forwarded unchanged, and the
// pointer returned by bli_packm_alloc_ex() is returned as-is.
void* bli_packm_alloc
     (
       siz_t      size_needed,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    return bli_packm_alloc_ex
    (
      size_needed,
      bli_cntl_packm_params_pack_buf_type( cntl ),
      rntm,
      cntl,
      thread
    );
}
// Returns the buffer of the mem_t entry cached in the control tree node,
// (re)acquiring a block of at least size_needed bytes of type pack_buf_type
// first if the cached entry is unallocated or too small.
//
// When a new block is needed, only the chief thread talks to the packing
// block allocator; the resulting mem_t is then broadcast and copied into
// every thread's control tree node.
void* bli_packm_alloc_ex
     (
       siz_t      size_needed,
       packbuf_t  pack_buf_type,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Address of the mem_t entry embedded in this thread's control tree node.
    mem_t* const node_mem = bli_cntl_pack_mem( cntl );

    // An unallocated entry counts as a block of size zero.
    const siz_t current_size =
        bli_mem_is_alloc( node_mem ) ? bli_mem_size( node_mem ) : 0;

    // Fast path: the cached block is already large enough.
    if ( current_size >= size_needed )
        return bli_mem_buffer( node_mem );

    // Scratch mem_t living on this thread's stack. Only the chief thread's
    // copy is filled in; the other threads read it through the broadcast
    // pointer below.
    mem_t fresh_mem;

    if ( bli_thread_am_ochief( thread ) )
    {
        // Give back the undersized block (if any) before acquiring a new one.
        if ( bli_mem_is_alloc( node_mem ) )
            bli_pba_release( rntm, node_mem );

        bli_pba_acquire_m( rntm, size_needed, pack_buf_type, &fresh_mem );
    }

    // Share the address of the chief's scratch mem_t with every thread and
    // copy its contents into this thread's control tree node.
    mem_t* const shared_mem = bli_thread_broadcast( thread, &fresh_mem );
    *node_mem = *shared_mem;

    // The chief must not return (and thereby invalidate its stack-resident
    // mem_t) until every thread has finished reading from it.
    bli_thread_barrier( thread );

    return bli_mem_buffer( node_mem );
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_alloc.h 0000664 0000000 0000000 00000003753 14634250137 0023605 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Obtain the packing buffer for the packm control tree node `cntl`. The pack
// buffer type is read from the node's packm parameters and the request is
// forwarded to bli_packm_alloc_ex(), whose return value is returned.
BLIS_EXPORT_BLIS void* bli_packm_alloc
(
siz_t size_needed,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Variant of bli_packm_alloc() that takes the pack buffer type explicitly.
// The block cached in the control tree node's mem_t entry is reused unless it
// is unallocated or smaller than size_needed, in which case the chief thread
// acquires a new block and the result is shared with all threads. Returns the
// buffer of the node's mem_t entry.
BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
(
siz_t size_needed,
packbuf_t pack_buf_type,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_blk_var1.c 0000664 0000000 0000000 00000026376 14634250137 0024215 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Default lookup table of structure-aware packm kernels. bli_packm_blk_var1()
// selects a row with bli_pack_schema_index( schema ) and then a datatype
// entry with bli_func_get_dt(). The 1e and 1r rows provide only complex
// kernels; their real-domain slots are NULL.
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
/* float (0) scomplex (1) double (2) dcomplex (3) */
// 0000 row/col panels
{ { bli_spackm_struc_cxk, bli_cpackm_struc_cxk,
bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } },
// 0001 row/col panels: 1m-expanded (1e)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
// 0010 row/col panels: 1m-reordered (1r)
{ { NULL, bli_cpackm_struc_cxk_1er,
NULL, bli_zpackm_struc_cxk_1er, } },
};
// Table of mixed-datatype packm kernels, declared via the GENARRAY2_ALL
// macro. bli_packm_blk_var1() indexes it as [ dt_c ][ dt_p ] when the source
// and packed datatypes differ.
// NOTE(review): bli_packm_struc_cxk_md.h is only included under
// #ifdef BLIS_ENABLE_GEMM_MD, whereas this table is declared unconditionally
// -- confirm this still builds when mixed-datatype support is disabled.
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);
// Packs the matrix described by obj_t c into the packed object p, one
// micropanel per loop iteration.
//
//   c      - source object to be packed.
//   p      - destination (packed) object; initialized by bli_packm_init().
//   cntx   - context, forwarded to the checks and to the packm kernel.
//   rntm   - runtime object, forwarded to bli_packm_init().
//   cntl   - packm control tree node supplying the pack schema, the
//            invert-diagonal flag, and the reverse-iteration flags.
//   thread - thrinfo_t node used to decide which micropanels this thread
//            packs.
//
// Returns early, without packing anything, when bli_packm_init() returns
// false.
void bli_packm_blk_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
// Extract various fields from the control tree.
pack_t schema = bli_cntl_packm_params_pack_schema( cntl );
bool invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );
// Every thread initializes p and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
// entry in the control tree node). Return early if no packing is required.
if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
return;
// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_int_check( c, p, cntx );
// Query the datatypes (and their sizes in bytes, used below to scale
// offsets into the char* buffers) of the source and packed objects.
num_t dt_c = bli_obj_dt( c );
dim_t dt_c_size = bli_dt_size( dt_c );
num_t dt_p = bli_obj_dt( p );
dim_t dt_p_size = bli_dt_size( dt_p );
// Query the structure, diagonal offset, and uplo/conj status of the source.
struc_t strucc = bli_obj_struc( c );
doff_t diagoffc = bli_obj_diag_offset( c );
diag_t diagc = bli_obj_diag( c );
uplo_t uploc = bli_obj_uplo( c );
conj_t conjc = bli_obj_conj_status( c );
// The loop below iterates over the length of p in steps of panel_dim_max;
// the width of p gives the (unpadded and padded) micropanel length.
dim_t iter_dim = bli_obj_length( p );
dim_t panel_len_full = bli_obj_width( p );
dim_t panel_len_max = bli_obj_padded_width( p );
// Buffer, strides, and offsets of the source object.
char* c_cast = bli_obj_buffer_at_off( c );
inc_t incc = bli_obj_row_stride( c );
inc_t ldc = bli_obj_col_stride( c );
dim_t panel_dim_off = bli_obj_row_off( c );
dim_t panel_len_off = bli_obj_col_off( c );
// Buffer, strides, and panel geometry of the packed object.
char* p_cast = bli_obj_buffer( p );
inc_t ldp = bli_obj_col_stride( p );
inc_t is_p = bli_obj_imag_stride( p );
dim_t panel_dim_max = bli_obj_panel_dim( p );
inc_t ps_p = bli_obj_panel_stride( p );
// Amount by which the diagonal offset changes from one micropanel to the
// next (see the computation of diagoffc_i inside the loop).
doff_t diagoffc_inc = ( doff_t )panel_dim_max;
// kappa_cast is handed to the packm kernel as its scalar argument.
obj_t kappa_local;
char* kappa_cast = bli_packm_scalar( &kappa_local, p );
// We use the default lookup table to determine the right func_t
// for the current schema.
func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];
// Query the datatype-specific function pointer from the func_t object.
packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );
// For mixed-precision gemm, select the proper kernel (only dense panels).
if ( dt_c != dt_p )
{
packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
}
// Query the address of the packm params field of the obj_t. The user might
// have set this field in order to specify a custom packm kernel.
packm_blk_var1_params_t* params = bli_obj_pack_params( c );
if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
{
// Query the user-provided packing kernel from the obj_t. If provided,
// this overrides the kernel determined above.
packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
}
/* Compute the total number of iterations we'll need (iter_dim divided by
panel_dim_max, rounded up). */
dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );
/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. Reverse iteration is
only honored for triangular matrices whose uplo matches the flag. */
dim_t ic0, ip0;
doff_t ic_inc, ip_inc;
if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
{
ic0 = (n_iter - 1) * panel_dim_max;
ic_inc = -panel_dim_max;
ip0 = n_iter - 1;
ip_inc = -1;
}
else
{
ic0 = 0;
ic_inc = panel_dim_max;
ip0 = 0;
ip_inc = 1;
}
// Query the number of threads and thread ids from the current thread's
// packm thrinfo_t node.
const dim_t nt = bli_thread_n_way( thread );
const dim_t tid = bli_thread_work_id( thread );
// Determine the thread range and increment using the current thread's
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
// will depend on whether slab or round-robin partitioning was requested
// at configure-time. (it_inc is written here but not read again in this
// function.)
dim_t it_start, it_end, it_inc;
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );
// p_begin tracks the start of the current micropanel within the packed
// buffer; it is advanced at the bottom of each loop iteration.
char* p_begin = p_cast;
// Iterate over every logical micropanel in the source matrix. Every thread
// walks all n_iter iterations (so that p_begin stays in sync) and only
// invokes the kernel on the iterations assigned to it via my_iter.
for ( dim_t ic = ic0, ip = ip0, it = 0; it < n_iter;
ic += ic_inc, ip += ip_inc, it += 1 )
{
// The last micropanel may be shorter than panel_dim_max.
dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic );
dim_t panel_dim_off_i = panel_dim_off + ic;
doff_t diagoffc_i = diagoffc + (ip )*diagoffc_inc;
char* c_begin = c_cast + (ic )*incc*dt_c_size;
// Default advance of p_begin: one full panel stride. Overridden below
// for diagonal-intersecting panels of triangular matrices.
inc_t p_inc = ps_p;
// NOTE: We MUST use round-robin partitioning when packing
// micropanels of a triangular matrix. Hermitian/symmetric
// and general packing may use slab or round-robin, depending
// on which was selected at configure-time.
// The definition of bli_packm_my_iter() will depend on whether slab
// or round-robin partitioning was requested at configure-time.
bool my_iter = bli_is_triangular( strucc )
? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
: bli_packm_my_iter ( it, it_start, it_end, tid, nt );
if ( bli_is_triangular( strucc ) &&
bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
{
// This case executes if the panel belongs to a triangular
// matrix AND is completely unstored (ie: zero). If the panel
// is unstored, we do nothing. (Notice that we don't even
// increment p_begin.)
continue;
}
else if ( bli_is_triangular( strucc ) &&
bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
{
// This case executes if the panel belongs to a triangular
// matrix AND is diagonal-intersecting. Notice that we
// cannot bury the following conditional logic into
// packm_struc_cxk() because we need to know the value of
// panel_len_max_i so we can properly increment p_inc.
// Sanity check. Diagonals should not intersect the short end of
// a micro-panel. If they do, then the constraint that cache
// blocksizes be whole multiples of the register blocksizes was
// somehow violated.
if ( diagoffc_i < 0 )
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
// Offset and (unpadded/padded) length of the portion of the
// micropanel that is actually packed.
dim_t panel_off_i;
dim_t panel_len_i;
dim_t panel_len_max_i;
if ( bli_is_lower( uploc ) )
{
panel_off_i = 0;
panel_len_i = bli_abs( diagoffc_i ) + panel_dim_i;
panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
panel_len_max );
}
else // if ( bli_is_upper( uploc ) )
{
panel_off_i = bli_abs( diagoffc_i );
panel_len_i = panel_len_full - panel_off_i;
panel_len_max_i = panel_len_max - panel_off_i;
}
dim_t panel_len_off_i = panel_off_i + panel_len_off;
// Skip the first panel_off_i columns of the source panel; the packed
// panel always starts at p_begin.
char* c_use = c_begin + (panel_off_i )*ldc*dt_c_size;
char* p_use = p_begin;
// We need to re-compute the imaginary stride as a function of
// panel_len_max_i since triangular packed matrices have panels
// of varying lengths. NOTE: This imaginary stride value is
// only referenced by the packm kernels for induced methods.
inc_t is_p_use = ldp * panel_len_max_i;
// We nudge the imaginary stride up by one if it is odd.
is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );
if ( my_iter )
{
packm_ker_cast( strucc,
diagc,
uploc,
conjc,
schema,
invdiag,
panel_dim_i,
panel_len_i,
panel_dim_max,
panel_len_max_i,
panel_dim_off_i,
panel_len_off_i,
kappa_cast,
c_use, incc, ldc,
p_use, ldp,
is_p_use,
cntx,
params );
}
// NOTE: This value is usually LESS than ps_p because triangular
// matrices usually have several micro-panels that are shorter
// than a "full" micro-panel.
p_inc = is_p_use;
}
else
{
// This case executes if the panel is either dense, or belongs
// to a Hermitian or symmetric matrix, which includes stored,
// unstored, and diagonal-intersecting panels.
if ( my_iter )
{
// A triangular matrix can only reach this branch with a panel that
// neither is unstored nor intersects the diagonal, so the kernel
// is told to treat it as general.
packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
diagc,
uploc,
conjc,
schema,
invdiag,
panel_dim_i,
panel_len_full,
panel_dim_max,
panel_len_max,
panel_dim_off_i,
panel_len_off,
kappa_cast,
c_begin, incc, ldc,
p_begin, ldp, is_p,
cntx,
params );
}
}
// Advance to the next micropanel in the packed buffer (p_inc is in
// elements of the packed datatype).
p_begin += p_inc*dt_p_size;
}
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_blk_var1.h 0000664 0000000 0000000 00000004147 14634250137 0024212 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// packm params types.
//
// Optional user-supplied packm kernels for bli_packm_blk_var1(), which
// retrieves this struct with bli_obj_pack_params(). A non-NULL
// ukr_fn[ dt_c ][ dt_p ] entry overrides the kernel that would otherwise be
// selected, and the struct pointer is also passed to the kernel as its last
// argument.
typedef struct
{
// First index: datatype of C (source). Second index: datatype of P (packed).
packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;
//
// Prototype object-based interfaces.
//
// Pack the object c into the packed object p, micropanel by micropanel,
// using the packing parameters stored in the packm control tree node cntl
// and dividing the micropanels among threads according to the thrinfo_t
// node t.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
(
obj_t* c,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* t
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_check.c 0000664 0000000 0000000 00000005402 14634250137 0023554 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Parameter check for packm initialization. Only the source object `a` is
// examined; `p` and `cntx` are accepted but not inspected. Any failure is
// reported through bli_check_error_code().
void bli_packm_init_check
     (
       obj_t*  a,
       obj_t*  p,
       cntx_t* cntx
     )
{
    // Validate the datatype of the source object.
    const err_t dt_status = bli_check_floating_object( a );
    bli_check_error_code( dt_status );

    // The control tree pointer is deliberately NOT validated here: a NULL
    // control tree is interpreted (in bli_packm_int()) as a request to skip
    // the operation, so it cannot be flagged as an error.
}
// Parameter check for the packm variants. Validates the datatypes of both
// the source object `a` and the packed object `p`, then checks that their
// dimensions are conformal. `cntx` is accepted but not inspected. Each
// result is handed to bli_check_error_code() before the next check runs.
void bli_packm_int_check
     (
       obj_t*  a,
       obj_t*  p,
       cntx_t* cntx
     )
{
    // Validate the datatype of the source object ...
    const err_t src_dt_status = bli_check_floating_object( a );
    bli_check_error_code( src_dt_status );

    // ... and of the packed object.
    const err_t dst_dt_status = bli_check_floating_object( p );
    bli_check_error_code( dst_dt_status );

    // Validate that the two objects have conformal dimensions.
    const err_t dims_status = bli_check_conformal_dims( a, p );
    bli_check_error_code( dims_status );

    // The control tree pointer is deliberately NOT validated here: a NULL
    // control tree is interpreted (in bli_packm_int()) as a request to skip
    // the operation, so it cannot be flagged as an error.
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_check.h 0000664 0000000 0000000 00000003514 14634250137 0023563 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter check used before packm initialization. The implementation
// validates obj_t a with bli_check_floating_object(); p and cntx are not
// examined.
void bli_packm_init_check
(
obj_t* a,
obj_t* p,
cntx_t* cntx
);
// Parameter check used by the packm variants. The implementation validates
// a and p with bli_check_floating_object() and then their dimensions with
// bli_check_conformal_dims(); cntx is not examined.
void bli_packm_int_check
(
obj_t* a,
obj_t* p,
cntx_t* cntx
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cntl.c 0000664 0000000 0000000 00000006225 14634250137 0023443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Creates a packm control tree node. A packm_params_t is acquired via
// bli_sba_acquire(), filled in from the arguments, and attached to a new
// node created by bli_cntl_create_node() with var_func as its variant
// function and sub_node as its child. Returns the new node.
BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node
     (
       rntm_t*   rntm,
       void_fp   var_func,
       bszid_t   bmid_m,
       bszid_t   bmid_n,
       bool      does_invert_diag,
       bool      rev_iter_if_upper,
       bool      rev_iter_if_lower,
       pack_t    pack_schema,
       packbuf_t pack_buf_type,
       cntl_t*   sub_node
     )
{
    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_packm_cntl_create_node(): " );
    #endif

    // Storage for the node's packm-specific parameters.
    packm_params_t* const node_params =
        bli_sba_acquire( rntm, sizeof( packm_params_t ) );

    // The size field records the size of the params struct itself.
    node_params->size              = sizeof( packm_params_t );
    node_params->bmid_m            = bmid_m;
    node_params->bmid_n            = bmid_n;
    node_params->does_invert_diag  = does_invert_diag;
    node_params->rev_iter_if_upper = rev_iter_if_upper;
    node_params->rev_iter_if_lower = rev_iter_if_lower;
    node_params->pack_schema       = pack_schema;
    node_params->pack_buf_type     = pack_buf_type;

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_packm_cntl_create_node(): " );
    #endif

    // The bszid field must be BLIS_NO_PART to signal that this node performs
    // no blocksize partitioning; bli_cntl_free() relies on that to step
    // through the thrinfo_t tree in sync with the cntl_t tree.
    return bli_cntl_create_node
    (
      rntm,
      BLIS_NOID,
      BLIS_NO_PART,
      var_func,
      node_params,
      sub_node
    );
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cntl.h 0000664 0000000 0000000 00000007030 14634250137 0023443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameters attached to a packm control tree node. An instance is acquired
// and filled in by bli_packm_cntl_create_node() and read back through the
// bli_cntl_packm_params_*() accessors.
struct packm_params_s
{
uint64_t size; // size field must be present and come first.
bszid_t bmid_m;
bszid_t bmid_n;
bool does_invert_diag; // forwarded to the packm kernel by bli_packm_blk_var1().
bool rev_iter_if_upper; // request reverse iteration for upper triangular matrices.
bool rev_iter_if_lower; // request reverse iteration for lower triangular matrices.
pack_t pack_schema; // selects the packm kernel table row in bli_packm_blk_var1().
packbuf_t pack_buf_type; // buffer type used by bli_packm_alloc().
};
typedef struct packm_params_s packm_params_t;
// Returns the bmid_m field of the packm parameters attached to cntl.
BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->bmid_m;
}
// Returns the bmid_n field of the packm parameters attached to cntl.
BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->bmid_n;
}
// Returns the does_invert_diag flag of the packm parameters attached to cntl.
BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->does_invert_diag;
}
// Returns the rev_iter_if_upper flag of the packm parameters attached to cntl.
BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->rev_iter_if_upper;
}
// Returns the rev_iter_if_lower flag of the packm parameters attached to cntl.
BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->rev_iter_if_lower;
}
// Returns the pack_schema field of the packm parameters attached to cntl.
BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->pack_schema;
}
// Returns the pack_buf_type field of the packm parameters attached to cntl.
BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
    return ( ( packm_params_t* )cntl->params )->pack_buf_type;
}
// -----------------------------------------------------------------------------
cntl_t* bli_packm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
bszid_t bmid_m,
bszid_t bmid_n,
bool does_invert_diag,
bool rev_iter_if_upper,
bool rev_iter_if_lower,
pack_t pack_schema,
packbuf_t pack_buf_type,
cntl_t* sub_node
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cxk.c 0000664 0000000 0000000 00000015544 14634250137 0023274 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Type-generic definition of packm_cxk: packs one panel_dim x panel_len
// micropanel of A (scaled by kappa, optionally conjugated) into P. If the
// context provides a packm kernel for panel_dim_max, that kernel is called;
// otherwise the panel is copied with scal2m and the edge region up to
// panel_dim_max x panel_len_max is explicitly zero-filled. The function is
// instantiated through INSERT_GENTFUNC_BASIC0 below (presumably once per
// supported datatype -- confirm against the macro's definition).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
) \
{ \
/* Note that we use panel_dim_max, not panel_dim, to query the packm
kernel function pointer. This means that we always use the same
kernel, even for edge cases. */ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim_max; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the packm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
if ( f != NULL ) \
{ \
/* Under normal circumstances, the packm kernel will copy over a
panel_dim x panel_len submatrix of A into P. However, the kernel
now handles zero-filling at edge cases, which typically consist of
the outer (panel_dim_max - panel_dim) rows or columns of the
micropanel. (Note that these rows/columns correspond to values
beyond the edge of matrix A.) The kernel intrinsically knows its
own panel_dim_max, since that corresponds to the packm micropanel's
normal width (corresponding to the gemm microkernel's register
blocksize, mr or nr). However, we *do* need to pass in panel_len_max
because the bottom-right edge case of trsm_lu will need all
elements above the extended diagonal and beyond (to the right of)
the bottom-right element to be initialized to zero so the trsm
portion of the computational kernel will operate with zeros for
those iterations.
For example, if trsm_lu is executed on a 10x10 triangular matrix,
and the gemmtrsm kernel uses MR = 6, the computation will begin
with the edge case, which is the bottom-right 4x4 upper triangular
matrix. Code in bli_packm_tri_cxk() will extend the diagonal as
identity into the remaining portion of the micropanel. But before
that happens, the packm kernel must have set the 0's added in
step (3) below.
packm kernel packm kernel packm kernel packm_tri_cxk
step 1: step 2: step 3: step 4:
x x x x . . x x x x . . x x x x 0 0 x x x x 0 0
? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0
? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0
? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
. . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
x Copied from A; valid element.
? Copied from A, but value is unknown and unused.
. Uninitialized.
0 Initialized to zero.
1 Initialized to one.
NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
to zero. This is not needed to support trsm, but rather to
support trmm. (Both use the same packing format and code.)
In this case, panel_dim will be 4 because four rows of data are
copied from A, panel_len will be 4 because those four rows span
four columns of A, and panel_len_max will be 6 because there are a
total of 6 columns that can be written to in the packed micropanel,
2 of which lie beyond the values copied from A. */ \
f \
( \
conja, \
schema, \
panel_dim, \
panel_len, \
panel_len_max, \
kappa, \
a, inca, lda, \
p, ldp, \
cntx \
); \
} \
else \
{ \
/* Treat the micro-panel as panel_dim x panel_len and column-stored
(unit row stride). */ \
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
( trans_t )conja, \
panel_dim, \
panel_len, \
kappa, \
a, inca, lda, \
p, 1, ldp, \
cntx, \
/* The rntm_t* can safely be NULL as long as it's not used by
scal2m_ex(). */ \
NULL \
); \
\
/* If panel_dim < panel_dim_max, then we zero those unused rows
(rows panel_dim .. panel_dim_max-1, across all panel_len_max
columns). */ \
if ( panel_dim < panel_dim_max ) \
{ \
const dim_t i = panel_dim; \
const dim_t m_edge = panel_dim_max - panel_dim; \
const dim_t n_edge = panel_len_max; \
ctype* restrict p_edge = p + (i )*1; \
\
PASTEMAC(ch,set0s_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 1, ldp \
); \
} \
\
/* If panel_len < panel_len_max, then we zero those unused columns
(columns panel_len .. panel_len_max-1, across all panel_dim_max
rows). */ \
if ( panel_len < panel_len_max ) \
{ \
const dim_t j = panel_len; \
const dim_t m_edge = panel_dim_max; \
const dim_t n_edge = panel_len_max - panel_len; \
ctype* restrict p_edge = p + (j )*ldp; \
\
PASTEMAC(ch,set0s_mxn) \
( \
m_edge, \
n_edge, \
p_edge, 1, ldp \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC0( packm_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cxk.h 0000664 0000000 0000000 00000004107 14634250137 0023272 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for the type-specific packm_cxk functions.

   Each instantiation declares a function named PASTEMAC(ch,varname) that
   packs one micropanel from a source matrix a into a packed buffer p:

   - conja:         conjugation to apply to a.
   - schema:        pack schema of the destination.
   - panel_dim:     number of rows of a copied into the micropanel.
   - panel_dim_max: number of rows in the packed micropanel; rows beyond
                    panel_dim are zeroed by the reference (non-kernel) path.
   - panel_len:     number of columns of a copied into the micropanel.
   - panel_len_max: number of columns that may be written in the packed
                    micropanel; columns beyond panel_len are zeroed by the
                    reference path.
   - kappa:         scalar applied while copying.
   - a, inca, lda:  source buffer and its two strides.
   - p, ldp:        destination buffer and its leading dimension.
   - cntx:          context that is queried for an optimized packm kernel.

   NOTE(review): INSERT_GENTPROT_BASIC0 presumably expands GENTPROT once per
   basic datatype; confirm against its definition. */
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cxk_1er.c 0000664 0000000 0000000 00000010324 14634250137 0024032 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function template for the type-specific packm_cxk_1er functions.

   Each instantiation packs a single micropanel of a into p:

   1. The context is queried for a packm kernel, using panel_dim_max as the
      kernel id.
   2. If a kernel is returned, it is invoked and is responsible for the whole
      micropanel. Note that panel_dim_max is not passed to the kernel.
   3. Otherwise a reference path is taken: scal21ms_mxn scales the
      panel_dim x panel_len region of a by kappa (honoring conja and schema)
      into p, and set1ms_mxn then writes zeros to
        - rows    [panel_dim, panel_dim_max) across all panel_len_max columns,
        - columns [panel_len, panel_len_max) across all panel_dim_max rows.

   NOTE(review): the "1er" suffix presumably refers to the 1e/1r pack schemas
   of the 1m method, and INSERT_GENTFUNCCO_BASIC0 presumably instantiates the
   template for the complex datatypes only; confirm against their
   definitions. */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
) \
{ \
/* Note that we use panel_dim_max, not panel_dim, to query the packm
kernel function pointer. This means that we always use the same
kernel, even for edge cases. */ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim_max; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the packm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we fall back to the
reference path built from scal21ms_mxn and set1ms_mxn. */ \
if ( f != NULL ) \
{ \
f \
( \
conja, \
schema, \
panel_dim, \
panel_len, \
panel_len_max, \
kappa, \
a, inca, lda, \
p, ldp, \
cntx \
); \
} \
else \
{ \
/* Treat the micro-panel as panel_dim x panel_len and column-stored
(unit row stride). */ \
\
PASTEMAC(ch,scal21ms_mxn) \
( \
schema, \
conja, \
panel_dim, \
panel_len, \
kappa, \
a, inca, lda, \
p, 1, ldp, ldp \
); \
\
/* If panel_dim < panel_dim_max, then we zero those unused rows. The
zeroed region starts at row panel_dim, column 0, and spans all
panel_len_max columns. */ \
if ( panel_dim < panel_dim_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
const dim_t offm = panel_dim; \
const dim_t offn = 0; \
const dim_t m_edge = panel_dim_max - panel_dim; \
const dim_t n_edge = panel_len_max; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, 1, ldp, ldp \
); \
} \
\
/* If panel_len < panel_len_max, then we zero those unused columns. The
zeroed region starts at row 0, column panel_len, and spans all
panel_dim_max rows. */ \
if ( panel_len < panel_len_max ) \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
const dim_t offm = 0; \
const dim_t offn = panel_len; \
const dim_t m_edge = panel_dim_max; \
const dim_t n_edge = panel_len_max - panel_len; \
\
PASTEMAC(ch,set1ms_mxn) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
zero, \
p, 1, ldp, ldp \
); \
} \
} \
}
INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_cxk_1er.h 0000664 0000000 0000000 00000004137 14634250137 0024044 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for the type-specific packm_cxk_1er functions.

   Each instantiation declares a function named PASTEMAC(ch,varname) that
   packs one micropanel from a into p:

   - conja:         conjugation to apply to a.
   - schema:        pack schema of the destination.
   - panel_dim:     number of rows of a that are packed.
   - panel_dim_max: number of rows in the packed micropanel; also used as the
                    id when looking up a packm kernel in cntx.
   - panel_len:     number of columns of a that are packed.
   - panel_len_max: number of columns in the packed micropanel.
   - kappa:         scalar applied while packing.
   - a, inca, lda:  source buffer and its two strides.
   - p, ldp:        destination buffer and its leading dimension.
   - cntx:          context that is queried for an optimized packm kernel.

   NOTE(review): INSERT_GENTPROTCO_BASIC0 presumably expands GENTPROTCO for
   the complex datatypes only; confirm against its definition. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
pack_t schema, \
dim_t panel_dim, \
dim_t panel_dim_max, \
dim_t panel_len, \
dim_t panel_len_max, \
ctype* kappa, \
ctype* a, inc_t inca, inc_t lda, \
ctype* p, inc_t ldp, \
cntx_t* cntx \
);
INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_init.c 0000664 0000000 0000000 00000017034 14634250137 0023446 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
bool bli_packm_init
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Prepare the object p so that the source object c can subsequently be
	// packed into it by one of the packm implementations, and attach a
	// packing buffer to p.
	//
	// Returns false when no packing is required (c is marked as all zeros,
	// or the computed buffer size is zero) and true once p has been fully
	// initialized and given a buffer.

	bli_init_once();

	if ( bli_error_checking_is_enabled() )
	{
		bli_packm_init_check( c, p, cntx );
	}

	// p starts out as an alias of the source object.
	bli_obj_alias_to( c, p );

	// A source flagged as all zeros needs no packing; leave p as the alias.
	if ( bli_obj_is_zeros( c ) )
	{
		return false;
	}

	// Packing parameters recorded in the control tree node.
	const bszid_t bszid_m     = bli_cntl_packm_params_bmid_m( cntl );
	const bszid_t bszid_n     = bli_cntl_packm_params_bmid_n( cntl );
	const pack_t  pack_schema = bli_cntl_packm_params_pack_schema( cntl );

	// Datatypes of interest on the source object.
	const num_t target_dt = bli_obj_target_dt( c );
	const num_t scalar_dt = bli_obj_scalar_dt( c );

	// Blocksize multiples: default (logical) values in m and n, plus the
	// maximum (packing/storage) value in m.
	const dim_t mult_m_def  = bli_cntx_get_blksz_def_dt( target_dt, bszid_m, cntx );
	const dim_t mult_m_pack = bli_cntx_get_blksz_max_dt( target_dt, bszid_m, cntx );
	const dim_t mult_n_def  = bli_cntx_get_blksz_def_dt( target_dt, bszid_n, cntx );

	// The attached scalar must be cast BEFORE the datatype of p is switched
	// to the target datatype below.
	if ( scalar_dt != target_dt )
	{
		bli_obj_scalar_cast_to( target_dt, p );
	}

	// p is stored in the target datatype of the source.
	bli_obj_set_dt( target_dt, p );
	bli_obj_set_elem_size( bli_dt_size( target_dt ), p );

	// Record the schema; drop conjugation (packing handles it); mark p as
	// dense because micropanels are being packed; reset view offsets.
	bli_obj_set_pack_schema( pack_schema, p );
	bli_obj_set_conj( BLIS_NO_CONJUGATE, p );
	bli_obj_set_uplo( BLIS_DENSE, p );
	bli_obj_set_offs( 0, 0, p );

	// Round the effective dimensions up to the blocksize multiples. The
	// padded values are the true dimensions of the zero-padded packed
	// matrix and must be stored on p.
	const dim_t m_eff    = bli_obj_length( p );
	const dim_t n_eff    = bli_obj_width( p );
	const dim_t m_padded = bli_align_dim_to_mult( m_eff, mult_m_def );
	const dim_t n_padded = bli_align_dim_to_mult( n_eff, mult_n_def );

	bli_obj_set_padded_dims( m_padded, n_padded, p );

	// Size of one element of the packed object.
	const siz_t elem_bytes = bli_obj_elem_size( p );

	// Micropanel geometry:
	// - the panel dimension is the default blocksize multiple in m;
	// - the row stride within a micropanel is unit;
	// - the column stride within a micropanel is the packing (storage)
	//   blocksize multiple in m;
	// - the imaginary stride is set to one.
	const dim_t panel_m     = mult_m_def;
	const inc_t row_inc     = 1;
	const inc_t col_inc     = mult_m_pack;
	const inc_t imag_inc    = 1;

	// Distance between the (0,0) elements of consecutive micropanels, based
	// on the padded width. Odd panel strides are bumped to the next even
	// value.
	inc_t panel_inc = col_inc * n_padded;
	if ( bli_is_odd( panel_inc ) )
	{
		panel_inc += 1;
	}

	// Commit strides and panel information to p.
	bli_obj_set_strides( row_inc, col_inc, p );
	bli_obj_set_imag_stride( imag_inc, p );
	bli_obj_set_panel_dim( panel_m, p );
	bli_obj_set_panel_stride( panel_inc, p );
	bli_obj_set_panel_length( panel_m, p );
	bli_obj_set_panel_width( n_eff, p );

	// Total bytes needed: one panel stride per micropanel.
	const dim_t n_panels     = m_padded / panel_m;
	const siz_t buffer_bytes = panel_inc * n_panels * elem_bytes;

	// Nothing to allocate for an empty buffer.
	if ( buffer_bytes == 0 )
	{
		return false;
	}

	// Obtain the packing buffer and attach it to p.
	bli_obj_set_buffer( bli_packm_alloc( buffer_bytes, rntm, cntl, thread ), p );

	return true;
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_init.h 0000664 0000000 0000000 00000003472 14634250137 0023454 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Initialize the object p so that the source object a can be packed into it,
// and attach a packing buffer obtained via bli_packm_alloc().
//
// Returns false if no packing is needed (the source is marked as all zeros,
// or the computed packed buffer size is zero); returns true otherwise.
BLIS_EXPORT_BLIS bool bli_packm_init
(
obj_t* a,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_int.c 0000664 0000000 0000000 00000004410 14634250137 0023267 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_packm_int
     (
       obj_t*     a,
       obj_t*     p,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Run the packing variant stored on the object a, bracketed by thread
	// barriers.

	bli_init_once();

	// The packing variant is carried by the source object itself.
	const packm_var_oft pack_variant = bli_obj_pack_fn( a );

	// Wait until all threads have finished any earlier computation that
	// used the same packing buffer.
	bli_thread_barrier( thread );

	pack_variant( a, p, cntx, rntm, cntl, thread );

	// Wait until packing is complete before any thread starts computing.
	bli_thread_barrier( thread );
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_int.h 0000664 0000000 0000000 00000003447 14634250137 0023305 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Invoke the packing variant stored in the object a (as returned by
// bli_obj_pack_fn()) with the given arguments. A thread barrier is executed
// both before and after the variant is called.
void bli_packm_int
(
obj_t* a,
obj_t* p,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_part.c 0000664 0000000 0000000 00000022041 14634250137 0023443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- Matrix partitioning ------------------------------------------------------
void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t     i,
                                  dim_t     b,
                                  obj_t*    obj,
                                  obj_t*    sub_obj )
{
	// Acquire, in sub_obj, the b-row subpartition of the packed object obj
	// that begins at row offset i.

	// Only the middle subpartition may be requested.
	if ( requested_part != BLIS_SUBPART1 )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	// Moving top-to-bottom through packed column panels is unsupported.
	if ( bli_obj_is_col_packed( obj ) )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	const dim_t m_parent = bli_obj_length( obj );
	const dim_t n_parent = bli_obj_width( obj );

	// Clamp b to the number of rows remaining at row offset i.
	if ( b > m_parent - i )
	{
		b = m_parent - i;
	}

	// Inherit everything except the view (dimensions and offsets) from the
	// parent, then give the subpartition its own dimensions.
	bli_obj_init_subpart_from( obj, sub_obj );
	bli_obj_set_dims( b, n_parent, sub_obj );

	// Adjust the padded length so that zero-padding is only applied to the
	// submatrix of interest: normally b, but for the edge iteration (where
	// i + b reaches the parent's length) whatever remains of the padded
	// length after offset i.
	const dim_t m_padded_full = bli_obj_padded_length( sub_obj );
	const dim_t m_padded_part = ( i + b == m_parent ) ? m_padded_full - i
	                                                  : b;

	bli_obj_set_padded_length( m_padded_part, sub_obj );

	// Advance the buffer pointer to the panel that contains row offset i.
	char*       base        = bli_obj_buffer( sub_obj );
	const siz_t elem_bytes  = bli_obj_elem_size( sub_obj );
	const dim_t elems_ahead = bli_packm_offset_to_panel_for( i, sub_obj );

	bli_obj_set_buffer( base + elem_bytes * elems_ahead, sub_obj );
}
void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t     j,
                                  dim_t     b,
                                  obj_t*    obj,
                                  obj_t*    sub_obj )
{
	// Acquire, in sub_obj, the b-column subpartition of the packed object
	// obj that begins at column offset j.

	// Only the middle subpartition may be requested.
	if ( requested_part != BLIS_SUBPART1 )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	// Moving left-to-right through packed row panels is unsupported.
	if ( bli_obj_is_row_packed( obj ) )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	const dim_t m_parent = bli_obj_length( obj );
	const dim_t n_parent = bli_obj_width( obj );

	// Clamp b to the number of columns remaining at column offset j.
	if ( b > n_parent - j )
	{
		b = n_parent - j;
	}

	// Inherit everything except the view (dimensions and offsets) from the
	// parent, then give the subpartition its own dimensions.
	bli_obj_init_subpart_from( obj, sub_obj );
	bli_obj_set_dims( m_parent, b, sub_obj );

	// Adjust the padded width so that zero-padding is only applied to the
	// submatrix of interest: normally b, but for the edge iteration (where
	// j + b reaches the parent's width) whatever remains of the padded
	// width after offset j.
	const dim_t n_padded_full = bli_obj_padded_width( sub_obj );
	const dim_t n_padded_part = ( j + b == n_parent ) ? n_padded_full - j
	                                                  : b;

	bli_obj_set_padded_width( n_padded_part, sub_obj );

	// Advance the buffer pointer to the panel that contains column offset j.
	char*       base        = bli_obj_buffer( sub_obj );
	const siz_t elem_bytes  = bli_obj_elem_size( sub_obj );
	const dim_t elems_ahead = bli_packm_offset_to_panel_for( j, sub_obj );

	bli_obj_set_buffer( base + elem_bytes * elems_ahead, sub_obj );
}
void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t     ij,
                                    dim_t     b,
                                    obj_t*    obj,
                                    obj_t*    sub_obj )
{
	// Partitioning a packed object along the diagonal is not implemented;
	// none of the arguments are consulted.
	( void )requested_part;
	( void )ij;
	( void )b;
	( void )obj;
	( void )sub_obj;

	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p )
{
	// Convert a row or column offset (offmn) within the packed object p
	// into an offset, in units of elements, to the panel containing it.
	// How the offset is interpreted depends on the pack schema of p.
	const pack_t schema = bli_obj_pack_schema( p );

	if ( schema == BLIS_PACKED_ROWS )
	{
		// Each row acts as its own panel, so the row stride serves as the
		// effective panel stride.
		return offmn * bli_obj_row_stride( p );
	}

	if ( schema == BLIS_PACKED_COLUMNS )
	{
		// Each column acts as its own panel, so the column stride serves
		// as the effective panel stride.
		return offmn * bli_obj_col_stride( p );
	}

	if ( schema == BLIS_PACKED_ROW_PANELS )
	{
		// The column stride equals the panel dimension, so dividing the
		// row offset by it yields a panel index; scaling that index by
		// the panel stride yields the element offset.
		const dim_t panel_idx = offmn / bli_obj_col_stride( p );
		const dim_t elem_off  = panel_idx * bli_obj_panel_stride( p );

		// The offset is expected to fall on a panel boundary.
		if ( offmn % bli_obj_col_stride( p ) > 0 ) bli_abort();

		return elem_off;
	}

	if ( schema == BLIS_PACKED_COL_PANELS )
	{
		// The row stride equals the panel dimension, so dividing the
		// column offset by it yields a panel index; scaling that index by
		// the panel stride yields the element offset.
		const dim_t panel_idx = offmn / bli_obj_row_stride( p );
		const dim_t elem_off  = panel_idx * bli_obj_panel_stride( p );

		// The offset is expected to fall on a panel boundary.
		if ( offmn % bli_obj_row_stride( p ) > 0 ) bli_abort();

		return elem_off;
	}

	// Any other schema is unsupported.
	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

	return 0;
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_part.h 0000664 0000000 0000000 00000005056 14634250137 0023457 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- Matrix partitioning ------------------------------------------------------
// Acquire the subpartition requested_part of obj into sub_obj, given an
// index (i, j, or ij) and a blocksize b.
// NOTE(review): the implementations are not visible from this header. The
// suffixes presumably name the direction of movement through the matrix
// (t2b: top-to-bottom, l2r: left-to-right, tl2br: top-left-to-bottom-right);
// confirm against bli_packm_part.c.
void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
dim_t i,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
dim_t j,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
dim_t ij,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
// Convert the offset offmn into an offset (in units of elements) to the
// corresponding panel of the packed object p. The offset is divided by the
// column stride (packed row panels) or the row stride (packed column panels)
// of p and then multiplied by the panel stride. The implementation calls
// bli_abort() when offmn is not a whole multiple of that stride, and reports
// BLIS_NOT_YET_IMPLEMENTED for any other pack schema.
dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_scalar.c 0000664 0000000 0000000 00000006203 14634250137 0023744 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void* bli_packm_scalar( obj_t* kappa, obj_t* p )
{
	// Query the datatype and pack schema of p up front, before the scalar
	// attached to p is (possibly) detached and reset below.
	num_t  dt         = bli_obj_dt( p );
	pack_t schema_p   = bli_obj_pack_schema( p );

	// Common case: the scalar attached to p has no imaginary component, or
	// p is packed with a native schema. Here the scalar is left attached so
	// that it can be applied later (in the micro-kernel), and BLIS_ONE is
	// returned so that the packm implementation performs no scaling while
	// packing.
	if ( !bli_obj_scalar_has_nonzero_imag( p ) ||
	     bli_is_nat_packed( schema_p ) )
	{
		return bli_obj_buffer_for_1x1( dt, &BLIS_ONE );
	}

	// Remaining case: the attached scalar has a nonzero imaginary component
	// and the schema is not a native one. The scalar is applied during
	// packing, which facilitates implementing induced complex domain
	// algorithms in terms of real domain micro-kernels (applying a real
	// scalar there is easy, applying a complex one is harder).

	// Move the scalar from p into kappa ...
	bli_obj_scalar_detach( p, kappa );

	// ... and reset the scalar attached to p (to 1.0) so that it is not
	// applied a second time.
	bli_obj_scalar_reset( p );

	return bli_obj_buffer_for_1x1( dt, kappa );
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_scalar.h 0000664 0000000 0000000 00000003315 14634250137 0023752 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Choose the scalar (kappa) that packm should apply while packing p and
// return a pointer to its 1x1 buffer, typed according to the datatype of p.
// When the scalar attached to p has a nonzero imaginary component and p does
// not use a native pack schema, that scalar is detached into kappa (and the
// scalar on p is reset); otherwise the buffer of BLIS_ONE is returned.
BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk.c 0000664 0000000 0000000 00000033455 14634250137 0024515 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// packm_struc_cxk: structure-aware front-end for packing one micro-panel.
// Dispatches on strucc to the plain pack kernel front-end (general), the
// Hermitian/symmetric helper, or the triangular helper.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx \
     ) \
{ \
	/* General matrices carry no structure to account for, so the
	   micro-panel is handed directly to the pack kernel front-end. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, schema, \
		  panel_dim, panel_dim_max, \
		  panel_len, panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx \
		); \
		return; \
	} \
\
	/* Hermitian and symmetric matrices are delegated to a helper. */ \
	if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		PASTEMAC(ch,packm_herm_cxk) \
		( \
		  strucc, diagc, uploc, conjc, \
		  schema, invdiag, \
		  panel_dim, panel_len, \
		  panel_dim_max, panel_len_max, \
		  panel_dim_off, panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  is_p, \
		  cntx \
		); \
		return; \
	} \
\
	/* Every remaining structure is handled as triangular. */ \
	PASTEMAC(ch,packm_tri_cxk) \
	( \
	  strucc, diagc, uploc, conjc, \
	  schema, invdiag, \
	  panel_dim, panel_len, \
	  panel_dim_max, panel_len_max, \
	  panel_dim_off, panel_len_off, \
	  kappa, \
	  c, incc, ldc, \
	  p,       ldp, \
	  is_p, \
	  cntx \
	); \
}
INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )
// packm_herm_cxk: pack one micro-panel of a Hermitian or symmetric matrix.
//
// The diagonal offset of the micro-panel is panel_dim_off - panel_len_off.
// - If the micro-panel does not intersect the diagonal, it is packed with a
//   single call to the pack kernel. When the panel lies in the unstored
//   region, c is first redirected to where the data is actually stored, incc
//   and ldc are swapped, and (for Hermitian structure) conjc is toggled.
// - If the micro-panel intersects the diagonal, it is packed in three steps:
//   p10 (the leading columns), p12 (the trailing columns), and finally p11
//   (the stored triangle of the diagonal block), which is copied with copym,
//   has the imaginary parts of its diagonal zeroed for Hermitian structure,
//   and is then scaled by kappa with scalm.
//
// The parameters diagc, invdiag and is_p are not referenced by this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs; \
dim_t i, j; \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
{ \
c = c + diagoffc * ( doff_t )ldc + \
-diagoffc * ( doff_t )incc; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be a whole multiple of the register blocksizes was
somehow violated. */ \
if ( diagoffc < 0 ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
/* Lower storage: p10 covers the first diagoffc_abs columns and is
read from c with the strides as given; p12 covers the remaining
columns (starting at the diagonal block) and is read with incc
and ldc exchanged. */ \
if ( bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == diagoffc_abs here, diagoffc12 evaluates to zero and
the second adjustment of c12 below adds nothing. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
-diagoffc12 * ( doff_t )incc; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
/* Upper storage: p10 covers the first diagoffc_abs + panel_dim
columns (up to and including the diagonal block) and is read
from the shifted pointer c10 with incc and ldc exchanged; p12
covers the remaining columns and is read with the strides as
given. */ \
else /* if ( bli_is_upper( uploc ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
-diagoffc10 * ( doff_t )incc; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. The diagonal block begins
diagoffc_abs columns into both c and p. This copy is not scaled;
kappa is applied by the scalm call further below. */ \
{ \
dim_t p11_m = panel_dim; \
dim_t p11_n = panel_dim; \
dim_t j2 = diagoffc_abs; \
ctype* restrict c11 = c + (j2 )*ldc; \
ctype* restrict p11 = p + (j2 )*ldp; \
trans_t transc = ( trans_t )conjc; \
\
PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
transc, \
p11_m, \
p11_n, \
c11, incc, ldc, \
p11, 1, ldp, \
cntx, \
NULL \
); \
\
/* If source matrix c is Hermitian, we have to zero out the
imaginary components of the diagonal of p11 in case the
corresponding elements in c11 were not already zero. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
ctype* restrict pi11 = p11; \
\
for ( i = 0; i < p11_m; ++i ) \
{ \
PASTEMAC(ch,seti0s)( *pi11 ); \
\
/* Advance one row and one column, i.e. along the diagonal. */ \
pi11 += 1 + ldp; \
} \
} \
\
/* Now that the diagonal has been made explicitly Hermitian
(if applicable), we can now safely scale the stored
triangle specified by uploc. */ \
PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
uploc, \
p11_m, \
p11_n, \
kappa, \
p11, 1, ldp, \
cntx, \
NULL \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )
// packm_tri_cxk: pack one micro-panel of a triangular matrix.
//
// The whole micro-panel is first packed with the pack kernel, and the packed
// panel p is then adjusted in place, in this order:
//   1. if diagc is unit, the diagonal of p is set to kappa;
//   2. if invdiag is TRUE, the diagonal of p is inverted;
//   3. the region on the opposite side of the diagonal from uploc is set to
//      zero;
//   4. if the panel is short in both dimension and length, the diagonal of
//      the bottom-right padding region is set to one.
//
// The parameters strucc and is_p are not referenced by this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
) \
{ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
\
/* Pack the panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
\
\
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffc, \
panel_dim, \
panel_len, \
kappa, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
( \
diagoffc, \
panel_dim, \
panel_len, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel. */ \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
uplo_t uplop = uploc; \
\
bli_toggle_uplo( &uplop ); \
/* This modifies diagoffc in place; the setd and invertd calls above
have already used the unshifted value. */ \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
\
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
diagoffc, \
BLIS_NONUNIT_DIAG, \
uplop, \
panel_dim, \
panel_len, \
zero, \
p, 1, ldp, \
cntx, \
NULL \
); \
} \
\
/* If this panel is an edge case in both panel dimension and length,
then it must be a bottom-right corner case. Set the part of the
diagonal that extends into the zero-padded region to identity.
NOTE: This is actually only necessary when packing for trsm, as
it helps prevent NaNs and Infs from creeping into the computation.
However, we set the region to identity for trmm as well. Those
1.0's end up getting multiplied by the 0.0's in the zero-padded
region of the other matrix, so there is no harm in this. */ \
if ( panel_dim != panel_dim_max && \
panel_len != panel_len_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t i = panel_dim; \
dim_t j = panel_len; \
dim_t m_br = panel_dim_max - i; \
dim_t n_br = panel_len_max - j; \
/* p_br addresses element (i, j) of p, using unit row stride and
column stride ldp. */ \
ctype* p_br = p + (i ) + (j )*ldp; \
\
PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
0, \
m_br, \
n_br, \
one, \
p_br, 1, ldp, \
cntx, \
NULL \
); \
} \
}
INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk.h 0000664 0000000 0000000 00000004750 14634250137 0024516 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the structure-aware micro-panel packing functions. All three
// share one parameter list:
//   strucc, diagc, uploc, conjc   - structure, diagonal type, stored triangle
//                                   and conjugation of the source matrix c
//   schema                        - pack schema passed on to the pack kernel
//   invdiag                       - whether the packed diagonal is inverted
//                                   (acted upon by the triangular variant)
//   panel_dim, panel_len          - actual micro-panel dimension and length
//   panel_dim_max, panel_len_max  - padded micro-panel dimension and length
//   panel_dim_off, panel_len_off  - offsets whose difference gives the
//                                   diagonal offset of the micro-panel
//   kappa                         - scalar applied while packing
//   c, incc, ldc                  - source micro-panel and its strides
//   p, ldp                        - destination micro-panel and its leading
//                                   dimension
//   is_p                          - accepted and forwarded by the dispatcher
//   cntx                          - context passed on to the kernels
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk_1er.c 0000664 0000000 0000000 00000035432 14634250137 0025261 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// packm_struc_cxk_1er: structure-aware front-end for packing one micro-panel
// with the 1er pack kernel. Dispatches on strucc to the pack kernel front-end
// (general), the Hermitian/symmetric helper, or the triangular helper.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx, \
       void*           params \
     ) \
{ \
	/* General matrices carry no structure to account for, so the
	   micro-panel is handed directly to the pack kernel front-end.
	   Note that params is not passed along on this path. */ \
	if ( bli_is_general( strucc ) ) \
	{ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, schema, \
		  panel_dim, panel_dim_max, \
		  panel_len, panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx \
		); \
		return; \
	} \
\
	/* Hermitian and symmetric matrices are delegated to a helper. */ \
	if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		PASTEMAC(ch,packm_herm_cxk_1er) \
		( \
		  strucc, diagc, uploc, conjc, \
		  schema, invdiag, \
		  panel_dim, panel_len, \
		  panel_dim_max, panel_len_max, \
		  panel_dim_off, panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  is_p, \
		  cntx, \
		  params \
		); \
		return; \
	} \
\
	/* Every remaining structure is handled as triangular. */ \
	PASTEMAC(ch,packm_tri_cxk_1er) \
	( \
	  strucc, diagc, uploc, conjc, \
	  schema, invdiag, \
	  panel_dim, panel_len, \
	  panel_dim_max, panel_len_max, \
	  panel_dim_off, panel_len_off, \
	  kappa, \
	  c, incc, ldc, \
	  p,       ldp, \
	  is_p, \
	  cntx, \
	  params \
	); \
}
INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )
// packm_herm_cxk_1er: pack one micro-panel of a Hermitian or symmetric matrix
// using the 1er pack kernel.
//
// The diagonal offset of the micro-panel is panel_dim_off - panel_len_off.
// - If the micro-panel does not intersect the diagonal, it is packed with a
//   single call to the pack kernel. When the panel lies in the unstored
//   region, c is first redirected to where the data is actually stored, incc
//   and ldc are swapped, and (for Hermitian structure) conjc is toggled.
// - If the micro-panel intersects the diagonal, it is packed in three steps:
//   p10 (the leading columns), p12 (the trailing columns), and finally p11
//   (the stored triangle of the diagonal block) via scal21ms_mxn_uplo, with
//   an additional scal21ms_mxn_diag pass over the diagonal for Hermitian
//   structure.
//
// The parameters diagc, invdiag, is_p and params are not referenced by this
// body.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
) \
{ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs; \
dim_t j; \
\
/* Handle the case where the micro-panel does NOT intersect the
diagonal separately from the case where it does intersect. */ \
if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
{ \
/* If the current panel is unstored, we need to make a few
adjustments so we refer to the data where it is actually
stored, also taking conjugation into account. (Note this
implicitly assumes we are operating on a dense panel
within a larger symmetric or Hermitian matrix, since a
general matrix would not contain any unstored region.) */ \
if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
{ \
c = c + diagoffc * ( doff_t )ldc + \
-diagoffc * ( doff_t )incc; \
bli_swap_incs( &incc, &ldc ); \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc ); \
} \
\
/* Pack the full panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
} \
else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
{ \
ctype* restrict c10; \
ctype* restrict p10; \
dim_t p10_dim, p10_len; \
inc_t incc10, ldc10; \
doff_t diagoffc10; \
conj_t conjc10; \
\
ctype* restrict c12; \
ctype* restrict p12; \
dim_t p12_dim, p12_len; \
inc_t incc12, ldc12; \
doff_t diagoffc12; \
conj_t conjc12; \
\
\
/* Sanity check. Diagonals should not intersect the short end of
a micro-panel. If they do, then the constraint that cache
blocksizes be a whole multiple of the register blocksizes was
somehow violated. */ \
if ( diagoffc < 0 ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
diagoffc_abs = bli_abs( diagoffc ); \
\
/* Lower storage: p10 covers the first diagoffc_abs columns and is
read from c with the strides as given; p12 covers the remaining
columns (starting at the diagonal block) and is read with incc
and ldc exchanged. */ \
if ( bli_is_lower( uploc ) ) \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs; \
p10 = p; \
c10 = c; \
incc10 = incc; \
ldc10 = ldc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
/* Since j == diagoffc_abs here, diagoffc12 evaluates to zero and
the second adjustment of c12 below adds nothing. */ \
diagoffc12 = diagoffc_abs - j; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
c12 = c12 + diagoffc12 * ( doff_t )ldc + \
-diagoffc12 * ( doff_t )incc; \
incc12 = ldc; \
ldc12 = incc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc12 ); \
} \
/* Upper storage: p10 covers the first diagoffc_abs + panel_dim
columns (up to and including the diagonal block) and is read
from the shifted pointer c10 with incc and ldc exchanged; p12
covers the remaining columns and is read with the strides as
given. */ \
else /* if ( bli_is_upper( uploc ) ) */ \
{ \
p10_dim = panel_dim; \
p10_len = diagoffc_abs + panel_dim; \
diagoffc10 = diagoffc; \
p10 = p; \
c10 = c; \
c10 = c10 + diagoffc10 * ( doff_t )ldc + \
-diagoffc10 * ( doff_t )incc; \
incc10 = ldc; \
ldc10 = incc; \
conjc10 = conjc; \
\
p12_dim = panel_dim; \
p12_len = panel_len - p10_len; \
j = p10_len; \
p12 = p + (j )*ldp; \
c12 = c + (j )*ldc; \
incc12 = incc; \
ldc12 = ldc; \
conjc12 = conjc; \
\
if ( bli_is_hermitian( strucc ) ) \
bli_toggle_conj( &conjc10 ); \
} \
\
/* Pack to p10. For upper storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc10, \
schema, \
p10_dim, \
panel_dim_max, \
p10_len, \
p10_len, \
kappa, \
c10, incc10, ldc10, \
p10, ldp, \
cntx \
); \
\
/* Pack to p12. For lower storage, this includes the unstored
triangle of c11. */ \
/* NOTE: Since we're only packing partial panels here, we pass in
p1x_len as panel_len_max; otherwise, the packm kernel will zero-
fill the columns up to panel_len_max, which is not what we need
or want to happen. */ \
PASTEMAC(ch,kername) \
( \
conjc12, \
schema, \
p12_dim, \
panel_dim_max, \
p12_len, \
p12_len, \
kappa, \
c12, incc12, ldc12, \
p12, ldp, \
cntx \
); \
\
/* Pack the stored triangle of c11 to p11. The diagonal block begins
diagoffc_abs columns into both c and p. */ \
{ \
/* NOTE(review): this j shadows the function-scope j declared at the
top of the body; the two hold unrelated values. */ \
dim_t j = diagoffc_abs; \
ctype* restrict c11 = c + (j )*ldc; \
ctype* restrict p11 = p + (j )*ldp; \
\
PASTEMAC(ch,scal21ms_mxn_uplo) \
( \
schema, \
uploc, \
conjc, \
panel_dim, \
kappa, \
c11, incc, ldc, \
p11, 1, ldp, ldp \
); \
\
/* If we are packing a micro-panel with Hermitian structure,
we must take special care of the diagonal. Now, if kappa
were guaranteed to be unit, all we would need to do is
explicitly zero out the imaginary part of the diagonal of
p11, in case the diagonal of the source matrix contained
garbage (non-zero) imaginary values. HOWEVER, since kappa
can be non-unit, things become a little more complicated.
In general, we must re-apply the kappa scalar to ONLY the
real part of the diagonal of the source matrix and save
the result to the diagonal of p11. */ \
if ( bli_is_hermitian( strucc ) ) \
{ \
/* View c11 as an array of ctype_r; the strides are doubled to
match, so that each step lands on the leading ctype_r value of
an element. */ \
/* NOTE(review): incc2 and ldc2 hold increments but are declared
as dim_t rather than inc_t. */ \
ctype_r* restrict c11_r = ( ctype_r* )c11; \
const dim_t incc2 = 2*incc; \
const dim_t ldc2 = 2*ldc; \
\
PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
( \
schema, \
panel_dim, \
panel_dim, \
kappa, \
c11_r, incc2, ldc2, \
p11, 1, ldp, ldp \
); \
} \
} \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )
// packm_tri_cxk_1er: pack one micro-panel of a triangular matrix using the
// 1er pack kernel.
//
// The whole micro-panel is first packed with the pack kernel, and the packed
// panel is then adjusted in place, in this order:
//   1. if diagc is unit, the diagonal of the diagonal block p11 is set to
//      kappa;
//   2. if invdiag is TRUE, the diagonal of p11 is inverted;
//   3. the triangle of p11 on the opposite side of the diagonal from uploc
//      is set to zero;
//   4. if the panel is short in both dimension and length, the diagonal of
//      the bottom-right padding region of p is set to one.
//
// The parameters strucc, is_p and params are not referenced by this body.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
) \
{ \
/* The diagonal block p11 begins diagoffc_abs columns into p; the
helper macros below are given p11 together with a diagonal offset
of zero. */ \
doff_t diagoffc = panel_dim_off - panel_len_off; \
doff_t diagoffc_abs = bli_abs( diagoffc ); \
ctype* p11 = p + (diagoffc_abs )*ldp; \
\
\
/* Pack the panel. */ \
PASTEMAC(ch,kername) \
( \
conjc, \
schema, \
panel_dim, \
panel_dim_max, \
panel_len, \
panel_len_max, \
kappa, \
c, incc, ldc, \
p, ldp, \
cntx \
); \
\
\
/* Tweak the panel according to its triangular structure */ \
{ \
/* If the diagonal of c is implicitly unit, explicitly set the
diagonal of the packed panel to kappa. */ \
if ( bli_is_unit_diag( diagc ) ) \
{ \
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
0, \
0, \
panel_dim, \
panel_dim, \
kappa, \
p11, 1, ldp, ldp \
); \
} \
\
\
/* If requested, invert the diagonal of the packed panel. */ \
if ( invdiag == TRUE ) \
{ \
PASTEMAC(ch,invert1ms_mxn_diag) \
( \
schema, \
0, \
0, \
panel_dim, \
panel_dim, \
p11, 1, ldp, ldp \
); \
} \
\
\
/* Set the region opposite the diagonal of p to zero. To do this,
we need to reference the "unstored" region on the other side of
the diagonal. This amounts to toggling uploc and then shifting
the diagonal offset to shrink the newly referenced region (by
one diagonal). Note that this zero-filling is not needed for
trsm, since the unstored region is not referenced by the trsm
micro-kernel; however, zero-filling is needed for trmm, which
uses the gemm micro-kernel. */ \
{ \
ctype* restrict zero = PASTEMAC(ch,0); \
uplo_t uplop = uploc; \
doff_t diagoffc11_0 = 0; \
dim_t p11_0_dim = panel_dim - 1; \
\
bli_toggle_uplo( &uplop ); \
bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \
\
/* Note that this macro works a little differently than the setm
operation. Here, we pass in the dimensions of only p11, rather
than the whole micro-panel, and furthermore we pass in the
"shrunken" dimensions of p11, corresponding to the toggling
and shrinking of the diagonal above. The macro will do the
right thing, incrementing the pointer to p11 by the appropriate
leading dimension (ldp or rs_p), and setting only the lower
or upper triangle to zero. */ \
PASTEMAC(ch,set1ms_mxn_uplo) \
( \
schema, \
diagoffc11_0, \
uplop, \
p11_0_dim, \
p11_0_dim, \
zero, \
p11, 1, ldp, ldp \
); \
} \
} \
\
/* If this micro-panel is an edge case in both panel dimension and
length, then it must be a bottom-right corner case, which
typically only happens for micro-panels being packed for trsm.
(It also happens for trmm if kr > 1.) Here, we set the part of
the diagonal that extends into the zero-padded region to
identity. This prevents NaNs and Infs from creeping into the
computation. If this code does execute for trmm, it is okay,
because those 1.0's that extend into the bottom-right region
end up getting multiplied by the 0.0's in the zero-padded region
of the other matrix. */ \
if ( panel_dim != panel_dim_max && \
panel_len != panel_len_max ) \
{ \
ctype* restrict one = PASTEMAC(ch,1); \
dim_t offm = panel_dim; \
dim_t offn = panel_len; \
dim_t m_edge = panel_dim_max - panel_dim; \
dim_t n_edge = panel_len_max - panel_len; \
\
/* Unlike the calls above, this one is given the start of the whole
micro-panel (p) together with explicit offsets (offm, offn). */ \
PASTEMAC(ch,set1ms_mxn_diag) \
( \
schema, \
offm, \
offn, \
m_edge, \
n_edge, \
one, \
p, 1, ldp, ldp \
); \
} \
}
INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk_1er.h 0000664 0000000 0000000 00000005054 14634250137 0025263 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for the 1e/1r structure-aware micro-panel packing
   functions. GENTPROTCO is (re)defined so that each INSERT_GENTPROTCO_BASIC0()
   line below declares functions named PASTEMAC(ch,varname) with the parameter
   list spelled out in the macro body.
   NOTE(review): which type characters (ch) get instantiated is decided by
   INSERT_GENTPROTCO_BASIC0, which is defined outside this file. */
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype* restrict kappa, \
ctype* restrict c, inc_t incc, inc_t ldc, \
ctype* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
);
/* Declare the general-structure, Hermitian/symmetric (herm) and triangular
   (tri) variants; all three share the signature above. */
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk_md.c 0000664 0000000 0000000 00000032243 14634250137 0025167 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_GEMM_MD
/* packm_struc_cxk_md: mixed-datatype micro-panel packing.

   Packs the panel_dim x panel_len region of c (elements of ctype_c) into the
   micro-panel p (elements of ctype_p), treating p as column-stored with unit
   row stride and leading dimension ldp. Whatever part of the
   panel_dim_max x panel_len_max micro-panel lies outside the packed region
   is then filled with zeros.

   Three pack schemas are handled:
     - native: c is cast into p via castm; kappa must equal 1.0.
     - 1r:     packing is delegated to packm_cxk_1r_md.
     - 1e:     packing is delegated to packm_cxk_1e_md.
   Any other schema is reported through
   bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ).

   The strucc, diagc, uploc, invdiag, panel_dim_off, panel_len_off, is_p and
   params arguments are not referenced by this implementation. */
#undef  GENTFUNC2
#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t           strucc, \
       diag_t            diagc, \
       uplo_t            uploc, \
       conj_t            conjc, \
       pack_t            schema, \
       bool              invdiag, \
       dim_t             panel_dim, \
       dim_t             panel_len, \
       dim_t             panel_dim_max, \
       dim_t             panel_len_max, \
       dim_t             panel_dim_off, \
       dim_t             panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p,             inc_t ldp, \
                                        inc_t is_p, \
       cntx_t*           cntx, \
       void*             params \
     ) \
{ \
    /* Does the micro-panel have unused rows and/or unused columns? */ \
    const bool short_dim = ( panel_dim < panel_dim_max ); \
    const bool short_len = ( panel_len < panel_len_max ); \
\
    if ( bli_is_nat_packed( schema ) ) \
    { \
        /* Sanity check: mixed-datatype alpha values are never applied */ \
        /* while packing for native execution (they are passed along to */ \
        /* the micro-kernel instead), so kappa is required to be 1.0. */ \
        if ( !PASTEMAC(chp,eq1)( *kappa ) ) \
            bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
        /* Typecast-copy the panel_dim x panel_len region; kappa is */ \
        /* ignored here because it was just verified to be 1.0. */ \
        PASTEMAC2(chc,chp,castm) \
        ( \
          ( trans_t )conjc, \
          panel_dim, \
          panel_len, \
          c, incc, ldc, \
          p, 1,    ldp  \
        ); \
\
        /* Zero the unused rows below the packed region. */ \
        if ( short_dim ) \
        { \
            ctype_p* restrict zero_p  = PASTEMAC(chp,0); \
            const dim_t       row0    = panel_dim; \
            const dim_t       m_pad   = panel_dim_max - row0; \
            const dim_t       n_pad   = panel_len_max; \
            ctype_p*          p_pad   = p + row0; \
\
            PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              m_pad, \
              n_pad, \
              zero_p, \
              p_pad, 1, ldp, \
              cntx, \
              NULL  \
            ); \
        } \
\
        /* Zero the unused columns to the right of the packed region. */ \
        if ( short_len ) \
        { \
            ctype_p* restrict zero_p  = PASTEMAC(chp,0); \
            const dim_t       col0    = panel_len; \
            const dim_t       m_pad   = panel_dim_max; \
            const dim_t       n_pad   = panel_len_max - col0; \
            ctype_p*          p_pad   = p + col0 * ldp; \
\
            PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              m_pad, \
              n_pad, \
              zero_p, \
              p_pad, 1, ldp, \
              cntx, \
              NULL  \
            ); \
        } \
    } \
    else if ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ) \
    { \
        /* The 1r and 1e schemas differ only in the packing routine that */ \
        /* is called; the zero-padding that follows is shared. */ \
        if ( bli_is_1r_packed( schema ) ) \
        { \
            PASTEMAC2(chc,chp,packm_cxk_1r_md) \
            ( \
              conjc, \
              panel_dim, \
              panel_len, \
              kappa, \
              c, incc, ldc, \
              p,       ldp  \
            ); \
        } \
        else \
        { \
            PASTEMAC2(chc,chp,packm_cxk_1e_md) \
            ( \
              conjc, \
              panel_dim, \
              panel_len, \
              kappa, \
              c, incc, ldc, \
              p,       ldp  \
            ); \
        } \
\
        /* Zero the unused rows below the packed region. */ \
        if ( short_dim ) \
        { \
            ctype_p* restrict zero_p  = PASTEMAC(chp,0); \
            const dim_t       row_off = panel_dim; \
            const dim_t       col_off = 0; \
            const dim_t       m_pad   = panel_dim_max - panel_dim; \
            const dim_t       n_pad   = panel_len_max; \
\
            /* Mark the locals as used. NOTE(review): presumably needed */ \
            /* because set1ms_mxn may not reference its arguments for */ \
            /* every chp -- confirm against its definition. */ \
            ( void )zero_p; \
            ( void )m_pad; ( void )row_off; \
            ( void )n_pad; ( void )col_off; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              row_off, \
              col_off, \
              m_pad, \
              n_pad, \
              zero_p, \
              p, 1, ldp, ldp  \
            ); \
        } \
\
        /* Zero the unused columns to the right of the packed region. */ \
        if ( short_len ) \
        { \
            ctype_p* restrict zero_p  = PASTEMAC(chp,0); \
            const dim_t       row_off = 0; \
            const dim_t       col_off = panel_len; \
            const dim_t       m_pad   = panel_dim_max; \
            const dim_t       n_pad   = panel_len_max - panel_len; \
\
            ( void )zero_p; \
            ( void )m_pad; ( void )row_off; \
            ( void )n_pad; ( void )col_off; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              row_off, \
              col_off, \
              m_pad, \
              n_pad, \
              zero_p, \
              p, 1, ldp, ldp  \
            ); \
        } \
    } \
    else \
    { \
        /* Mixed-datatype packing should not occur for any other schemas. */ \
        bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
    } \
}
INSERT_GENTFUNC2_BASIC0( packm_struc_cxk_md )
INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md )
// -----------------------------------------------------------------------------
/* packm_cxk_1r_md: mixed-datatype packing of an m x n matrix a into p using
   the 1r layout.

   Both a and p are addressed through pointers to their PASTEMAC(..,ctyper)
   types, which is why every stride is doubled. For each of the n columns the
   first component of every element is written starting at p and the second
   component starting ldp entries later; successive columns are 2*ldp entries
   apart.

   Four kernels are selected from two independent conditions:
     - kappa == 1.0 -> plain copy (copyris / copyjris),
       otherwise    -> scaled copy (scal2ris / scal2jris);
     - bli_is_conj( conja ) selects the "j" form of either kernel. */
#undef  GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t            conja, \
       dim_t             m, \
       dim_t             n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p,             inc_t ldp  \
     ) \
{ \
    const inc_t inca2 = 2 * inca; \
    const inc_t lda2  = 2 * lda; \
    const inc_t ldp2  = 2 * ldp; \
\
    PASTEMAC(chp,ctyper)* restrict kappa_r = ( PASTEMAC(chp,ctyper)* )kappa; \
    PASTEMAC(chp,ctyper)* restrict kappa_i = ( PASTEMAC(chp,ctyper)* )kappa + 1; \
    PASTEMAC(cha,ctyper)* restrict a_re    = ( PASTEMAC(cha,ctyper)* )a; \
    PASTEMAC(cha,ctyper)* restrict a_im    = ( PASTEMAC(cha,ctyper)* )a + 1; \
    PASTEMAC(chp,ctyper)* restrict p_re    = ( PASTEMAC(chp,ctyper)* )p; \
    PASTEMAC(chp,ctyper)* restrict p_im    = ( PASTEMAC(chp,ctyper)* )p + ldp; \
\
    /* kappa_i is only consumed by the scaling kernels. */ \
    ( void )kappa_i; \
\
    const bool kappa_is_one = PASTEMAC(chp,eq1)( *kappa ); \
    const bool conj_a       = bli_is_conj( conja ); \
\
    if ( kappa_is_one && conj_a ) \
    { \
        /* Unit kappa, conjugated source. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC2(cha,chp,copyjris) \
                ( \
                  a_re[ i*inca2 ], \
                  a_im[ i*inca2 ], \
                  p_re[ i ], \
                  p_im[ i ]  \
                ); \
            } \
\
            a_re += lda2; \
            a_im += lda2; \
            p_re += ldp2; \
            p_im += ldp2; \
        } \
    } \
    else if ( kappa_is_one ) \
    { \
        /* Unit kappa, source used as-is. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC2(cha,chp,copyris) \
                ( \
                  a_re[ i*inca2 ], \
                  a_im[ i*inca2 ], \
                  p_re[ i ], \
                  p_im[ i ]  \
                ); \
            } \
\
            a_re += lda2; \
            a_im += lda2; \
            p_re += ldp2; \
            p_im += ldp2; \
        } \
    } \
    else if ( conj_a ) \
    { \
        /* General kappa, conjugated source. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC3(chp,cha,chp,scal2jris) \
                ( \
                  *kappa_r, \
                  *kappa_i, \
                  a_re[ i*inca2 ], \
                  a_im[ i*inca2 ], \
                  p_re[ i ], \
                  p_im[ i ]  \
                ); \
            } \
\
            a_re += lda2; \
            a_im += lda2; \
            p_re += ldp2; \
            p_im += ldp2; \
        } \
    } \
    else \
    { \
        /* General kappa, source used as-is. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC3(chp,cha,chp,scal2ris) \
                ( \
                  *kappa_r, \
                  *kappa_i, \
                  a_re[ i*inca2 ], \
                  a_im[ i*inca2 ], \
                  p_re[ i ], \
                  p_im[ i ]  \
                ); \
            } \
\
            a_re += lda2; \
            a_im += lda2; \
            p_re += ldp2; \
            p_im += ldp2; \
        } \
    } \
}
INSERT_GENTFUNC2_BASIC0( packm_cxk_1r_md )
INSERT_GENTFUNC2_MIXDP0( packm_cxk_1r_md )
// -----------------------------------------------------------------------------
/* packm_cxk_1e_md: mixed-datatype packing of an m x n matrix a into p using
   the 1e layout.

   Each source element a(i,j) is handed to a "1es" kernel together with two
   destinations in p: one in the leading part of column j (p_ri) and one
   located ldp/2 elements further on (p_ir). Columns of a are lda apart and
   columns of p are ldp apart, all measured in whole ctype_a / ctype_p
   elements.

   Four kernels are selected from two independent conditions:
     - kappa == 1.0 -> plain copy (copy1es / copyj1es),
       otherwise    -> scaled copy (scal21es / scal2j1es);
     - bli_is_conj( conja ) selects the "j" form of either kernel. */
#undef  GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t            conja, \
       dim_t             m, \
       dim_t             n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p,             inc_t ldp  \
     ) \
{ \
    const inc_t a_rs = inca; \
    const inc_t a_cs = lda; \
    const inc_t p_cs = ldp; \
\
    ctype_a* restrict a_ri = ( ctype_a* )a; \
    ctype_p* restrict p_ri = ( ctype_p* )p; \
    ctype_p* restrict p_ir = ( ctype_p* )p + p_cs/2; \
\
    /* Mark a_rs as used. NOTE(review): presumably needed because the 1es */ \
    /* kernels may not reference their arguments for every type combination */ \
    /* -- confirm against their definitions. */ \
    ( void )a_rs; \
\
    const bool kappa_is_one = PASTEMAC(chp,eq1)( *kappa ); \
    const bool conj_a       = bli_is_conj( conja ); \
\
    if ( kappa_is_one && conj_a ) \
    { \
        /* Unit kappa, conjugated source. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC2(cha,chp,copyj1es) \
                ( \
                  a_ri[ i*a_rs ], \
                  p_ri[ i ], \
                  p_ir[ i ]  \
                ); \
            } \
\
            a_ri += a_cs; \
            p_ri += p_cs; \
            p_ir += p_cs; \
        } \
    } \
    else if ( kappa_is_one ) \
    { \
        /* Unit kappa, source used as-is. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC2(cha,chp,copy1es) \
                ( \
                  a_ri[ i*a_rs ], \
                  p_ri[ i ], \
                  p_ir[ i ]  \
                ); \
            } \
\
            a_ri += a_cs; \
            p_ri += p_cs; \
            p_ir += p_cs; \
        } \
    } \
    else if ( conj_a ) \
    { \
        /* General kappa, conjugated source. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC3(chp,cha,chp,scal2j1es) \
                ( \
                  *kappa, \
                  a_ri[ i*a_rs ], \
                  p_ri[ i ], \
                  p_ir[ i ]  \
                ); \
            } \
\
            a_ri += a_cs; \
            p_ri += p_cs; \
            p_ir += p_cs; \
        } \
    } \
    else \
    { \
        /* General kappa, source used as-is. */ \
        for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
        { \
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                PASTEMAC3(chp,cha,chp,scal21es) \
                ( \
                  *kappa, \
                  a_ri[ i*a_rs ], \
                  p_ri[ i ], \
                  p_ir[ i ]  \
                ); \
            } \
\
            a_ri += a_cs; \
            p_ri += p_cs; \
            p_ir += p_cs; \
        } \
    } \
}
INSERT_GENTFUNC2_BASIC0( packm_cxk_1e_md )
INSERT_GENTFUNC2_MIXDP0( packm_cxk_1e_md )
#endif
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_struc_cxk_md.h 0000664 0000000 0000000 00000006100 14634250137 0025165 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Prototype template for the mixed-datatype micro-panel packing function
   packm_struc_cxk_md. The source matrix c has element type ctype_c, while
   kappa and the packed micro-panel p have element type ctype_p. Each
   INSERT_GENTPROT2_*() line below declares functions named
   PASTEMAC2(chc,chp,varname).
   NOTE(review): the type-character pairs that are instantiated are chosen
   by the INSERT_GENTPROT2_BASIC0 / INSERT_GENTPROT2_MIXDP0 macros, which
   are defined outside this file. */
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
( \
struc_t strucc, \
diag_t diagc, \
uplo_t uploc, \
conj_t conjc, \
pack_t schema, \
bool invdiag, \
dim_t panel_dim, \
dim_t panel_len, \
dim_t panel_dim_max, \
dim_t panel_len_max, \
dim_t panel_dim_off, \
dim_t panel_len_off, \
ctype_p* restrict kappa, \
ctype_c* restrict c, inc_t incc, inc_t ldc, \
ctype_p* restrict p, inc_t ldp, \
inc_t is_p, \
cntx_t* cntx, \
void* params \
);
INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )
/* Prototype template shared by the mixed-datatype 1e and 1r packing helpers
   (packm_cxk_1e_md and packm_cxk_1r_md). Both take an m x n source matrix a
   of element type ctype_a with strides inca/lda, a scalar kappa of type
   ctype_p, and a destination p of type ctype_p with leading dimension ldp. */
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
( \
conj_t conja, \
dim_t m, \
dim_t n, \
ctype_p* restrict kappa, \
ctype_a* restrict a, inc_t inca, inc_t lda, \
ctype_p* restrict p, inc_t ldp \
);
INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )
INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_thrinfo.c 0000664 0000000 0000000 00000004346 14634250137 0024156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Initializes a thrinfo_t node for use by packm. This is a thin wrapper
// around bli_thrinfo_init() that forwards the communicator, its id, the
// n_way/work_id pair and the sub-node, while always passing FALSE and
// BLIS_NO_PART for the remaining two arguments.
void bli_packm_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bszid_t    bszid,
       thrinfo_t* sub_node
     )
{
    // The caller-supplied bszid is accepted for interface reasons but is not
    // forwarded; BLIS_NO_PART is used unconditionally below.
    ( void )bszid;

    bli_thrinfo_init( thread,
                      ocomm, ocomm_id,
                      n_way, work_id,
                      FALSE,
                      BLIS_NO_PART,
                      sub_node );
}
// Initializes a packm thrinfo_t node for the single-threaded case: the node
// is attached to BLIS_SINGLE_COMM with communicator id 0, uses a 1-way
// partition with work id 0, and has no sub-node.
void bli_packm_thrinfo_init_single
     (
       thrinfo_t* thread
     )
{
    const dim_t ocomm_id = 0;
    const dim_t n_way    = 1;
    const dim_t work_id  = 0;

    bli_packm_thrinfo_init( thread,
                            &BLIS_SINGLE_COMM, ocomm_id,
                            n_way,
                            work_id,
                            BLIS_NO_PART,
                            NULL );
}
cython-blis-1.0.0/blis/_src/frame/1m/packm/bli_packm_thrinfo.h 0000664 0000000 0000000 00000005762 14634250137 0024166 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// thrinfo_t macros specific to packm.
//
/*
#define bli_packm_thread_my_iter( index, thread ) \
\
( index % thread->n_way == thread->work_id % thread->n_way )
*/
// Round-robin ownership test: iteration i belongs to the calling thread when
// i and work_id are congruent modulo n_way. The start and end arguments are
// accepted (so that both variants share one signature) but are not used.
// Every macro parameter is parenthesized so that expression arguments such
// as "j + 1" are evaluated as a unit rather than being split by operator
// precedence.
#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \
\
	( ( i ) % ( n_way ) == ( work_id ) % ( n_way ) )
// Slab ownership test: iteration i belongs to the calling thread when it
// lies in the half-open range [start, end). The work_id and n_way arguments
// are accepted but are not used. Parameters are parenthesized for the same
// reason as above.
#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \
\
	( ( start ) <= ( i ) && ( i ) < ( end ) )
// Define a general-purpose version of bli_packm_my_iter() whose definition
// depends on whether slab or round-robin partitioning was requested at
// configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
#define bli_packm_my_iter bli_packm_my_iter_sl
#else // BLIS_ENABLE_JRIR_RR
#define bli_packm_my_iter bli_packm_my_iter_rr
#endif
//
// thrinfo_t APIs specific to packm.
//
#if 0
thrinfo_t* bli_packm_thrinfo_create
(
thrcomm_t* ocomm,
dim_t ocomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
);
#endif
// Initializes an existing thrinfo_t node (thread) for packm using the given
// communicator (ocomm / ocomm_id), partitioning (n_way / work_id) and
// sub-node.
// NOTE(review): the implementation in bli_packm_thrinfo.c passes
// BLIS_NO_PART to bli_thrinfo_init() regardless of bszid -- confirm whether
// callers rely on bszid being honored.
void bli_packm_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
dim_t n_way,
dim_t work_id,
bszid_t bszid,
thrinfo_t* sub_node
);
// Initializes an existing thrinfo_t node (thread) for single-threaded packm.
void bli_packm_thrinfo_init_single
(
thrinfo_t* thread
);
#if 0
void bli_packm_thrinfo_free
(
thrinfo_t* thread
);
#endif
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/ 0000775 0000000 0000000 00000000000 14634250137 0020674 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm.h 0000664 0000000 0000000 00000003444 14634250137 0023336 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_unpackm_cntl.h"
#include "bli_unpackm_check.h"
#include "bli_unpackm_int.h"
#include "bli_unpackm_blk_var1.h"
#include "bli_unpackm_cxk.h"
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_blk_var1.c 0000664 0000000 0000000 00000020603 14634250137 0025106 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the datatype-specific implementations of
// unpackm_blk_var1. FUNCPTR_T is a file-local alias for unpackm_fp.
#define FUNCPTR_T unpackm_fp
typedef void (*FUNCPTR_T)(
struc_t strucc,
doff_t diagoffc,
diag_t diagc,
uplo_t uploc,
trans_t transc,
dim_t m,
dim_t n,
dim_t m_panel,
dim_t n_panel,
void* p, inc_t rs_p, inc_t cs_p,
dim_t pd_p, inc_t ps_p,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx
);
// Table of typed implementations; bli_unpackm_blk_var1() indexes it with the
// datatype (num_t) of the destination object.
// NOTE(review): the table's contents are produced by the GENARRAY macro,
// which is defined outside this file.
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);
// Object-based front end for unpackm (block variant 1). Extracts the
// datatype, structure, dimensions, buffers and strides from the packed
// object p and the destination object c, then dispatches to the typed
// implementation selected by c's datatype. The cntl and thread arguments
// are not referenced here.
void bli_unpackm_blk_var1
     (
       obj_t*     p,
       obj_t*     c,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // The datatype of C selects the typed implementation.
    const num_t dt = bli_obj_dt( c );

    // Structural properties are read from C rather than from P (the usual
    // source operand): the packm/unpackm framework does not yet guarantee
    // struc(P) == struc(C), since densification may have marked P as dense
    // when the root is upper or lower.
    const struc_t struc_c   = bli_obj_struc( c );
    const doff_t  diagoff_c = bli_obj_diag_offset( c );
    const diag_t  diag_c    = bli_obj_diag( c );
    const uplo_t  uplo_c    = bli_obj_uplo( c );

    // Likewise the trans status comes from C: the packed matrix itself is
    // never transposed, so any transposition stems from how C was packed.
    // Only the transposition bit is queried (not conjugation), since we
    // probably do not want to un-conjugate a matrix that was conjugated when
    // it was packed.
    const trans_t trans_c   = bli_obj_onlytrans_status( c );

    // Dimensions and panel dimensions (all queried from C).
    const dim_t   m_c       = bli_obj_length( c );
    const dim_t   n_c       = bli_obj_width( c );
    const dim_t   m_panel   = bli_obj_panel_length( c );
    const dim_t   n_panel   = bli_obj_panel_width( c );

    // Buffer, strides and panel geometry of the packed source P.
    void* const   buf_p     = bli_obj_buffer_at_off( p );
    const inc_t   rs_p      = bli_obj_row_stride( p );
    const inc_t   cs_p      = bli_obj_col_stride( p );
    const dim_t   pd_p      = bli_obj_panel_dim( p );
    const inc_t   ps_p      = bli_obj_panel_stride( p );

    // Buffer and strides of the destination C.
    void* const   buf_c     = bli_obj_buffer_at_off( c );
    const inc_t   rs_c      = bli_obj_row_stride( c );
    const inc_t   cs_c      = bli_obj_col_stride( c );

    // Look up the typed implementation and invoke it.
    ftypes[ dt ]
    (
      struc_c,
      diagoff_c,
      diag_c,
      uplo_c,
      trans_c,
      m_c,
      n_c,
      m_panel,
      n_panel,
      buf_p, rs_p, cs_p,
             pd_p, ps_p,
      buf_c, rs_c, cs_c,
      cntx
    );
}
/* unpackm_blk_var1 (typed implementation): copies the contents of the packed
   matrix p back into the m x n matrix c, one panel at a time.

   The strides of p determine the orientation: if p is row-stored (per
   bli_is_row_stored_f) the panels are column panels and the loop walks the
   n dimension; otherwise they are row panels and the loop walks the m
   dimension. Each panel holds at most pd_p rows/columns and consecutive
   panels are ps_p elements apart in p.

   A panel that intersects the diagonal of an upper- or lower-stored c is
   copied with scal2m (alpha = 1) so that structure is honored; every other
   panel goes through unpackm_cxk. The strucc argument is not referenced. */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t  diagoffc, \
       diag_t  diagc, \
       uplo_t  uploc, \
       trans_t transc, \
       dim_t   m, \
       dim_t   n, \
       dim_t   m_panel, \
       dim_t   n_panel, \
       void*   p, inc_t rs_p, inc_t cs_p, \
               dim_t pd_p, inc_t ps_p, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    ctype* restrict one    = PASTEMAC(ch,1); \
    ctype* restrict c_cast = c; \
    ctype* restrict p_cast = p; \
\
    /* If c needs a transposition, induce it up front so that the rest of */ \
    /* the code can be expressed without reference to transc. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_negate_diag_offset( &diagoffc ); \
        bli_toggle_uplo( &uploc ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* Row-stored p means we unpack from column panels; otherwise we */ \
    /* unpack from row panels. */ \
    const bool   col_panels    = bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ); \
\
    const dim_t  panel_dim_max = pd_p; \
    const dim_t  iter_dim      = ( col_panels ? n    : m    ); \
    const dim_t  panel_len     = ( col_panels ? m    : n    ); \
    const inc_t  incc          = ( col_panels ? cs_c : rs_c ); \
    const inc_t  ldc           = ( col_panels ? rs_c : cs_c ); \
    const inc_t  vs_c          = incc; \
    const inc_t  ldp           = ( col_panels ? rs_p : cs_p ); \
    const doff_t diagoffc_inc  = ( col_panels ? -( doff_t )panel_dim_max \
                                              :  ( doff_t )panel_dim_max ); \
\
    /* Total number of panels, counting a trailing partial panel. */ \
    const dim_t  num_iter      = iter_dim / panel_dim_max + \
                                 ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
    for ( dim_t it = 0; it < num_iter; ++it ) \
    { \
        /* Offset of this panel within the iterated dimension of c, and */ \
        /* the number of rows/columns the panel actually holds. */ \
        const dim_t  ic          = it * panel_dim_max; \
        const dim_t  panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
        /* Diagonal offset of c as seen from the current panel. */ \
        const doff_t diagoffc_i  = diagoffc + it * diagoffc_inc; \
\
        /* Dimensions of the current panel when viewed as a matrix. */ \
        const dim_t  m_cur       = ( col_panels ? m           : panel_dim_i ); \
        const dim_t  n_cur       = ( col_panels ? panel_dim_i : n           ); \
\
        ctype* restrict p_begin  = p_cast + it * ps_p; \
        ctype* restrict c_begin  = c_cast + ic * vs_c; \
\
        /* If the current panel of C intersects the diagonal AND C is */ \
        /* upper or lower stored, we must call scal2m. Otherwise we can */ \
        /* use a variant that is oblivious to structure and storage (and */ \
        /* thus tends to be faster). */ \
        if ( bli_intersects_diag_n( diagoffc_i, m_cur, n_cur ) && \
             bli_is_upper_or_lower( uploc ) ) \
        { \
            PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
            ( \
              diagoffc_i, \
              diagc, \
              uploc, \
              transc, \
              m_cur, \
              n_cur, \
              one, \
              p_begin, rs_p, cs_p, \
              c_begin, rs_c, cs_c, \
              cntx, \
              NULL  \
            ); \
        } \
        else \
        { \
            /* Unpack the current panel. */ \
            PASTEMAC(ch,unpackm_cxk) \
            ( \
              BLIS_NO_CONJUGATE, \
              panel_dim_i, \
              panel_len, \
              one, \
              p_begin,       ldp, \
              c_begin, incc, ldc, \
              cntx  \
            ); \
        } \
    } \
}
INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 )
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_blk_var1.h 0000664 0000000 0000000 00000004445 14634250137 0025121 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based entry point for unpackm (block variant 1): unpacks the packed
// object p into the object c. See bli_unpackm_blk_var1.c for the definition.
void bli_unpackm_blk_var1
(
obj_t* p,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
);
/* Prototype template for the datatype-specific implementations of
   unpackm_blk_var1. p is the packed source (strides rs_p/cs_p, panel
   dimension pd_p, panel stride ps_p) and c is the destination (strides
   rs_c/cs_c); both are passed as void* and are cast to ctype* by the
   implementation. */
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
struc_t strucc, \
doff_t diagoffc, \
diag_t diagc, \
uplo_t uploc, \
trans_t transc, \
dim_t m, \
dim_t n, \
dim_t m_panel, \
dim_t n_panel, \
void* p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_check.c 0000664 0000000 0000000 00000004632 14634250137 0024466 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate the operands handed to bli_unpackm_int(). The result of every
// individual check is forwarded directly to bli_check_error_code().
// The cntx argument is accepted but not inspected here.
void bli_unpackm_int_check
     (
       obj_t*  p,
       obj_t*  a,
       cntx_t* cntx
     )
{
    // Datatypes: both objects must be floating-point objects.
    bli_check_error_code( bli_check_floating_object( p ) );
    bli_check_error_code( bli_check_floating_object( a ) );

    // Dimensions: p and a must be conformal.
    bli_check_error_code( bli_check_conformal_dims( p, a ) );

    // Pack status: the schema of p must be valid for an unpack.
    bli_check_error_code( bli_check_packm_schema_on_unpack( p ) );

    // Control tree pointer.
    // NOTE: The control tree cannot be checked until a NULL value stops
    // being interpreted (in bli_unpackm_int()) as a request to skip the
    // operation.
    //bli_check_error_code( bli_check_valid_cntl( ( void* )cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_check.h 0000664 0000000 0000000 00000003354 14634250137 0024473 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for the parameter-checking routine of unpackm. Takes the two
// objects involved in the unpack (p and a) and a context.
void bli_unpackm_int_check
(
obj_t* p,
obj_t* a,
cntx_t* cntx
);
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_cntl.c 0000664 0000000 0000000 00000005322 14634250137 0024346 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Build a control tree node for unpackm.
//
//   rntm              runtime object forwarded to bli_cntl_create_node().
//   var_func          function pointer stored in the node itself.
//   unpackm_var_func  function pointer stored in the node's parameter struct.
//   sub_node          node attached beneath the newly created node.
//
// Returns the node produced by bli_cntl_create_node(). Note that the
// function currently calls bli_abort() before doing anything else.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     )
{
    err_t r_val;

    // NOTE: If this function is ever called, figure out whether the
    // bli_malloc_intl() below needs to be changed to bli_sba_acquire().
    bli_abort();

    // Obtain storage for the parameter struct and fill in its fields.
    unpackm_params_t* params =
        bli_malloc_intl( sizeof( unpackm_params_t ), &r_val );

    params->size     = sizeof( unpackm_params_t );
    params->var_func = unpackm_var_func;

    // The bszid field must be BLIS_NO_PART to signal that no blocksize
    // partitioning takes place. bli_cntl_free() relies on this to walk the
    // thrinfo_t tree in step with the cntl_t tree.
    return bli_cntl_create_node
    (
      rntm,
      BLIS_NOID,
      BLIS_NO_PART,
      var_func,
      params,
      sub_node
    );
}
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_cntl.h 0000664 0000000 0000000 00000004325 14634250137 0024355 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter struct attached to an unpackm control tree node.
struct unpackm_params_s
{
uint64_t size; // size field must be present and come first.
unpackm_var_oft var_func; // unpackm variant read back via the accessor macro below.
};
typedef struct unpackm_params_s unpackm_params_t;
// Accessor: reinterpret the params pointer of a control tree node as an
// unpackm_params_t and yield its var_func field.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
( ( (unpackm_params_t*)(cntl)->params )->var_func )
// -----------------------------------------------------------------------------
// Prototype for the unpackm control tree node constructor. Takes a runtime
// object, two function pointers (one for the node, one for the unpackm
// variant), and the sub-node to attach; returns the new node.
cntl_t* bli_unpackm_cntl_create_node
(
rntm_t* rntm,
void_fp var_func,
void_fp unpackm_var_func,
cntl_t* sub_node
);
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_cxk.c 0000664 0000000 0000000 00000006007 14634250137 0024174 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Typed unpackm_cxk definitions. GENTFUNC is redefined as a function
// template whose name is built by PASTEMAC(ch,opname). Each generated
// function:
//   1. uses panel_dim as a kernel id and queries the context for a matching
//      unpackm kernel of datatype dt;
//   2. if a kernel is returned, calls it with conjp, panel_len, kappa,
//      p/ldp, a/inca/lda and the context;
//   3. otherwise falls back to the expert typed scal2m routine, reusing
//      conjp as a trans_t and addressing p as a panel_dim x panel_len matrix
//      with row stride 1 and column stride ldp.
// NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
// instantiates GENTFUNC once per basic datatype -- confirm.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjp, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* kappa, \
ctype* p, inc_t ldp, \
ctype* a, inc_t inca, inc_t lda, \
cntx_t* cntx \
) \
{ \
num_t dt = PASTEMAC(ch,type); \
l1mkr_t ker_id = panel_dim; \
\
PASTECH2(ch,opname,_ker_ft) f; \
\
/* Query the context for the unpackm kernel corresponding to the current
panel dimension, or kernel id. If the id is invalid, the function will
return NULL. */ \
f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \
\
/* If there exists a kernel implementation for the micro-panel dimension
provided, we invoke the implementation. Otherwise, we use scal2m. */ \
if ( f != NULL ) \
{ \
f \
( \
conjp, \
panel_len, \
kappa, \
p, ldp, \
a, inca, lda, \
cntx \
); \
} \
else \
{ \
trans_t transp = ( trans_t )conjp; \
\
/* Treat the micro-panel as panel_dim x panel_len and column-stored
(unit row stride). */ \
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
( \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
transp, \
panel_dim, \
panel_len, \
kappa, \
p, 1, ldp, \
a, inca, lda, \
cntx, \
NULL \
); \
} \
}
INSERT_GENTFUNC_BASIC0( unpackm_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_cxk.h 0000664 0000000 0000000 00000003760 14634250137 0024204 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Typed prototypes for unpackm_cxk. GENTPROT is redefined as a prototype
// template whose function name is built by PASTEMAC(ch,varname); the
// parameters are a conjugation flag, the panel dimension and length, a
// pointer to the scalar kappa, the buffer p with leading dimension ldp, the
// buffer a with strides inca and lda, and a context.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conjp, \
dim_t panel_dim, \
dim_t panel_len, \
ctype* kappa, \
ctype* p, inc_t ldp, \
ctype* a, inc_t inca, inc_t lda, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( unpackm_cxk )
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_int.c 0000664 0000000 0000000 00000004777 14634250137 0024215 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Internal entry point for unpackm. After optional parameter checking, the
// unpack variant stored in the control tree node is run by the thread for
// which bli_thread_am_ochief() is true, and every thread then meets at a
// barrier. Nothing is done when p is an alias of a.
void bli_unpackm_int
     (
       obj_t*     p,
       obj_t*     a,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    bli_init_once();

    // Validate the arguments when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_unpackm_int_check( p, a, cntx );
    }

    // When p was aliased to a at pack time (a was already in an acceptable
    // packed/contiguous format) there is nothing to unpack.
    if ( bli_obj_is_alias_of( p, a ) )
    {
        return;
    }

    // Fetch the variant recorded in the current control tree node.
    unpackm_var_oft unpack_variant = bli_cntl_unpackm_params_var_func( cntl );

    // Only the chief thread performs the unpack.
    if ( bli_thread_am_ochief( thread ) )
    {
        unpack_variant( p, a, cntx, cntl, thread );
    }

    // Synchronize so that unpacking has finished before computation resumes.
    bli_thread_barrier( thread );
}
cython-blis-1.0.0/blis/_src/frame/1m/unpackm/bli_unpackm_int.h 0000664 0000000 0000000 00000003425 14634250137 0024207 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for the internal unpackm entry point. Takes the two objects
// involved in the unpack (p and a), a context, a control tree node, and the
// calling thread's info structure.
void bli_unpackm_int
(
obj_t* p,
obj_t* a,
cntx_t* cntx,
cntl_t* cntl,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/2/ 0000775 0000000 0000000 00000000000 14634250137 0017062 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/bli_l2.h 0000664 0000000 0000000 00000004776 14634250137 0020414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l2_check.h"
// Define function types.
#include "bli_l2_ft_unb.h"
// Prototype object APIs (expert and non-expert).
#include "bli_oapi_ex.h"
#include "bli_l2_oapi.h"
#include "bli_xapi_undef.h"
#include "bli_oapi_ba.h"
#include "bli_l2_oapi.h"
#include "bli_xapi_undef.h"
// Prototype typed APIs (expert and non-expert).
#include "bli_tapi_ex.h"
#include "bli_l2_tapi.h"
#include "bli_l2_ft.h"
#include "bli_xapi_undef.h"
#include "bli_tapi_ba.h"
#include "bli_l2_tapi.h"
#include "bli_l2_ft.h"
#include "bli_xapi_undef.h"
// Generate function pointer arrays for tapi functions (expert only).
#include "bli_l2_fpa.h"
// Operation-specific headers
#include "bli_gemv.h"
#include "bli_ger.h"
#include "bli_hemv.h"
#include "bli_her.h"
#include "bli_her2.h"
#include "bli_symv.h"
#include "bli_syr.h"
#include "bli_syr2.h"
#include "bli_trmv.h"
#include "bli_trsv.h"
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_check.c 0000664 0000000 0000000 00000024415 14634250137 0021534 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate the operands of gemv: run the checks shared by the matrix-vector
// operations, then require a to be a general object whose datatype is
// consistent with both x and y.
void bli_gemv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* beta,
       obj_t* y
     )
{
    // Checks shared by gemv/hemv/symv/trmv/trsv.
    bli_xxmv_check( alpha, a, x, beta, y );

    // Structure of a.
    bli_check_error_code( bli_check_general_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// Validate the operands of hemv: run the checks shared by the matrix-vector
// operations, then require a to be square and Hermitian with a datatype
// consistent with both x and y.
void bli_hemv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* beta,
       obj_t* y
     )
{
    // Checks shared by gemv/hemv/symv/trmv/trsv.
    bli_xxmv_check( alpha, a, x, beta, y );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_hermitian_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// Validate the operands of symv: run the checks shared by the matrix-vector
// operations, then require a to be square and symmetric with a datatype
// consistent with both x and y.
void bli_symv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* beta,
       obj_t* y
     )
{
    // Checks shared by gemv/hemv/symv/trmv/trsv.
    bli_xxmv_check( alpha, a, x, beta, y );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_symmetric_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// Validate the operands of trmv. The shared matrix-vector checks are run
// with alpha standing in for beta and x standing in for y, after which a
// must be square and triangular with a datatype consistent with x.
void bli_trmv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x
     )
{
    // Checks shared by gemv/hemv/symv/trmv/trsv.
    bli_xxmv_check( alpha, a, x, alpha, x );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_triangular_object( a ) );

    // Datatype consistency between a and x.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
}
// Validate the operands of trsv. The shared matrix-vector checks are run
// with alpha standing in for beta and x standing in for y, after which a
// must be square and triangular with a datatype consistent with x.
void bli_trsv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x
     )
{
    // Checks shared by gemv/hemv/symv/trmv/trsv.
    bli_xxmv_check( alpha, a, x, alpha, x );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_triangular_object( a ) );

    // Datatype consistency between a and x.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
}
// Validate the operands of ger: run the checks shared by the rank-update
// operations, then require a to be a general object whose datatype is
// consistent with both x and y.
void bli_ger_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* y,
       obj_t* a
     )
{
    // Checks shared by ger/her/her2/syr/syr2.
    bli_xxr_check( alpha, x, y, a );

    // Structure of a.
    bli_check_error_code( bli_check_general_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// Validate the operands of her. The shared rank-update checks are run with
// x supplied as both vectors, after which a must be square and Hermitian
// with a datatype consistent with x.
void bli_her_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* a
     )
{
    // Checks shared by ger/her/her2/syr/syr2.
    bli_xxr_check( alpha, x, x, a );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_hermitian_object( a ) );

    // Datatype consistency between a and x.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
}
// Validate the operands of her2: run the checks shared by the rank-update
// operations, then require a to be square and Hermitian with a datatype
// consistent with both x and y.
void bli_her2_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* y,
       obj_t* a
     )
{
    // Checks shared by ger/her/her2/syr/syr2.
    bli_xxr_check( alpha, x, y, a );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_hermitian_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// Validate the operands of syr. The shared rank-update checks are run with
// x supplied as both vectors, after which a must be square and symmetric
// with a datatype consistent with x.
void bli_syr_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* a
     )
{
    // Checks shared by ger/her/her2/syr/syr2.
    bli_xxr_check( alpha, x, x, a );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_symmetric_object( a ) );

    // Datatype consistency between a and x.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
}
// Validate the operands of syr2: run the checks shared by the rank-update
// operations, then require a to be square and symmetric with a datatype
// consistent with both x and y.
void bli_syr2_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* y,
       obj_t* a
     )
{
    // Checks shared by ger/her/her2/syr/syr2.
    bli_xxr_check( alpha, x, y, a );

    // a must be square.
    bli_check_error_code( bli_check_square_object( a ) );

    // Structure of a.
    bli_check_error_code( bli_check_symmetric_object( a ) );

    // Datatype consistency between a and each vector.
    bli_check_error_code( bli_check_consistent_object_datatypes( a, x ) );
    bli_check_error_code( bli_check_consistent_object_datatypes( a, y ) );
}
// -----------------------------------------------------------------------------
// Checks common to the matrix-vector operations (gemv/hemv/symv/trmv/trsv).
// Covers, in order: datatypes of all five operands, their shapes, agreement
// of the vector lengths with the dimensions of a after transposition, and
// the object buffers. Every result is passed to bli_check_error_code().
void bli_xxmv_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* x,
       obj_t* beta,
       obj_t* y
     )
{
    // Datatypes: scalars must be non-integer, a/x/y must be floating-point.
    bli_check_error_code( bli_check_noninteger_object( alpha ) );
    bli_check_error_code( bli_check_noninteger_object( beta ) );
    bli_check_error_code( bli_check_floating_object( a ) );
    bli_check_error_code( bli_check_floating_object( x ) );
    bli_check_error_code( bli_check_floating_object( y ) );

    // Shapes: alpha/beta are scalars, a is a matrix, x/y are vectors.
    bli_check_error_code( bli_check_scalar_object( alpha ) );
    bli_check_error_code( bli_check_scalar_object( beta ) );
    bli_check_error_code( bli_check_matrix_object( a ) );
    bli_check_error_code( bli_check_vector_object( x ) );
    bli_check_error_code( bli_check_vector_object( y ) );

    // Lengths: x matches the width of a after transposition, y its length.
    bli_check_error_code(
        bli_check_vector_dim_equals( x, bli_obj_width_after_trans( a ) ) );
    bli_check_error_code(
        bli_check_vector_dim_equals( y, bli_obj_length_after_trans( a ) ) );

    // Buffers (for non-NULLness).
    bli_check_error_code( bli_check_object_buffer( alpha ) );
    bli_check_error_code( bli_check_object_buffer( a ) );
    bli_check_error_code( bli_check_object_buffer( x ) );
    bli_check_error_code( bli_check_object_buffer( beta ) );
    bli_check_error_code( bli_check_object_buffer( y ) );
}
// Checks common to the rank-update operations (ger/her/her2/syr/syr2).
// Covers, in order: datatypes of all four operands, their shapes, agreement
// of the vector lengths with the dimensions of a after transposition, and
// the object buffers. Every result is passed to bli_check_error_code().
void bli_xxr_check
     (
       obj_t* alpha,
       obj_t* x,
       obj_t* y,
       obj_t* a
     )
{
    // Datatypes: alpha must be non-integer, x/y/a must be floating-point.
    bli_check_error_code( bli_check_noninteger_object( alpha ) );
    bli_check_error_code( bli_check_floating_object( x ) );
    bli_check_error_code( bli_check_floating_object( y ) );
    bli_check_error_code( bli_check_floating_object( a ) );

    // Shapes: alpha is a scalar, x/y are vectors, a is a matrix.
    bli_check_error_code( bli_check_scalar_object( alpha ) );
    bli_check_error_code( bli_check_vector_object( x ) );
    bli_check_error_code( bli_check_vector_object( y ) );
    bli_check_error_code( bli_check_matrix_object( a ) );

    // Lengths: x matches the length of a after transposition, y its width.
    bli_check_error_code(
        bli_check_vector_dim_equals( x, bli_obj_length_after_trans( a ) ) );
    bli_check_error_code(
        bli_check_vector_dim_equals( y, bli_obj_width_after_trans( a ) ) );

    // Buffers (for non-NULLness).
    bli_check_error_code( bli_check_object_buffer( alpha ) );
    bli_check_error_code( bli_check_object_buffer( x ) );
    bli_check_error_code( bli_check_object_buffer( y ) );
    bli_check_error_code( bli_check_object_buffer( a ) );
}
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_check.h 0000664 0000000 0000000 00000005535 14634250137 0021543 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Check-function prototypes for operations taking ( alpha, a, x, beta, y ).
// GENPROT( opname ) declares a function whose name is built by
// PASTEMAC(opname,_check); it is instantiated for gemv, hemv, and symv.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* beta, \
obj_t* y \
);
GENPROT( gemv )
GENPROT( hemv )
GENPROT( symv )
// Check-function prototypes for operations taking ( alpha, x, y, a ).
// GENPROT is redefined with that parameter list and instantiated for ger,
// her2, and syr2.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y, \
obj_t* a \
);
GENPROT( ger )
GENPROT( her2 )
GENPROT( syr2 )
// Prototype template for the check routine of an operation that takes one
// scalar object (alpha), the object x, and the object a as the last
// argument.
#undef  GENPROT
#define GENPROT( opname ) \
    void PASTEMAC(opname,_check)( \
        obj_t* alpha, \
        obj_t* x, \
        obj_t* a \
    );

GENPROT( her )
GENPROT( syr )
// Prototype template for the check routine of an operation that takes one
// scalar object (alpha), the object a, and the object x as the last
// argument.
#undef  GENPROT
#define GENPROT( opname ) \
    void PASTEMAC(opname,_check)( \
        obj_t* alpha, \
        obj_t* a, \
        obj_t* x \
    );

GENPROT( trmv )
GENPROT( trsv )
// -----------------------------------------------------------------------------
// Declares bli_xxmv_check(), which has the same five-object argument list
// (alpha, a, x, beta, y) as the gemv/hemv/symv check prototypes in this
// header.
// NOTE(review): presumably the shared implementation behind those checks;
// confirm against the definition, which is not visible from this header.
void bli_xxmv_check( obj_t* alpha,
                     obj_t* a,
                     obj_t* x,
                     obj_t* beta,
                     obj_t* y );
// Declares bli_xxr_check(), which has the same four-object argument list
// (alpha, x, y, a) as the ger/her2/syr2 check prototypes in this header.
// NOTE(review): presumably the shared implementation behind those checks;
// confirm against the definition, which is not visible from this header.
void bli_xxr_check( obj_t* alpha,
                    obj_t* x,
                    obj_t* y,
                    obj_t* a );
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_fpa.c 0000664 0000000 0000000 00000006667 14634250137 0021236 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For every level-2 operation, emit
//   1. a GENARRAY_FPA(...) instantiation for the typed API name carrying
//      the BLIS_TAPI_EX_SUF suffix, and
//   2. the matching ..._qfp( dt ) query, which returns element dt of the
//      ..._fpa array.
// NOTE(review): the ..._fpa array is expected to be defined by
// GENARRAY_FPA, which lives outside this file -- confirm its size covers
// every num_t value that callers may pass.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
    GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
                  PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
    { \
        PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) selected = \
            PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
\
        return selected; \
    }

GENFRONT( gemv )
GENFRONT( ger )

GENFRONT( hemv )
GENFRONT( symv )

GENFRONT( her )
GENFRONT( syr )

GENFRONT( her2 )
GENFRONT( syr2 )

GENFRONT( trmv )
GENFRONT( trsv )
//
// Define function pointer query interfaces for level-2 implementations.
//
#undef GENFRONT
#define GENFRONT( opname, varname ) \
\
GENARRAY_FPA( PASTECH2(opname,_unb,_vft), \
varname ); \
\
PASTECH2(opname,_unb,_vft) \
PASTEMAC(varname,_qfp)( num_t dt ) \
{ \
return PASTECH(varname,_fpa)[ dt ]; \
}
GENFRONT( gemv, gemv_unb_var1 )
GENFRONT( gemv, gemv_unb_var2 )
GENFRONT( gemv, gemv_unf_var1 )
GENFRONT( gemv, gemv_unf_var2 )
GENFRONT( ger, ger_unb_var1 )
GENFRONT( ger, ger_unb_var2 )
GENFRONT( hemv, hemv_unb_var1 )
GENFRONT( hemv, hemv_unb_var2 )
GENFRONT( hemv, hemv_unb_var3 )
GENFRONT( hemv, hemv_unb_var4 )
GENFRONT( hemv, hemv_unf_var1 )
GENFRONT( hemv, hemv_unf_var3 )
GENFRONT( hemv, hemv_unf_var1a )
GENFRONT( hemv, hemv_unf_var3a )
GENFRONT( her, her_unb_var1 )
GENFRONT( her, her_unb_var2 )
GENFRONT( her2, her2_unb_var1 )
GENFRONT( her2, her2_unb_var2 )
GENFRONT( her2, her2_unb_var3 )
GENFRONT( her2, her2_unb_var4 )
GENFRONT( her2, her2_unf_var1 )
GENFRONT( her2, her2_unf_var4 )
GENFRONT( trmv, trmv_unb_var1 )
GENFRONT( trmv, trmv_unb_var2 )
GENFRONT( trmv, trmv_unf_var1 )
GENFRONT( trmv, trmv_unf_var2 )
GENFRONT( trsv, trsv_unb_var1 )
GENFRONT( trsv, trsv_unb_var2 )
GENFRONT( trsv, trsv_unf_var1 )
GENFRONT( trsv, trsv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_fpa.h 0000664 0000000 0000000 00000006131 14634250137 0021225 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// Declares, per level-2 operation, the ..._qfp( dt ) query that takes a
// datatype and returns a function pointer of the operation's ..._vft type
// (both names carry the BLIS_TAPI_EX_SUF suffix).
#undef  GENPROT
#define GENPROT( opname ) \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( gemv )
GENPROT( ger )

GENPROT( hemv )
GENPROT( symv )

GENPROT( her )
GENPROT( syr )

GENPROT( her2 )
GENPROT( syr2 )

GENPROT( trmv )
GENPROT( trsv )
//
// Prototype function pointer query interfaces for level-2 implementations.
//
// Declares, per level-2 implementation variant, the <varname>_qfp( dt )
// query. Its return type is the <opname>_unb_vft function-pointer type
// for every variant, including the *_unf_* ones.
#undef  GENPROT
#define GENPROT( opname, varname ) \
    PASTECH2(opname,_unb,_vft) \
        PASTEMAC(varname,_qfp)( num_t dt );

GENPROT( gemv, gemv_unb_var1 )
GENPROT( gemv, gemv_unb_var2 )
GENPROT( gemv, gemv_unf_var1 )
GENPROT( gemv, gemv_unf_var2 )

GENPROT( ger,  ger_unb_var1 )
GENPROT( ger,  ger_unb_var2 )

GENPROT( hemv, hemv_unb_var1 )
GENPROT( hemv, hemv_unb_var2 )
GENPROT( hemv, hemv_unb_var3 )
GENPROT( hemv, hemv_unb_var4 )
GENPROT( hemv, hemv_unf_var1 )
GENPROT( hemv, hemv_unf_var3 )
GENPROT( hemv, hemv_unf_var1a )
GENPROT( hemv, hemv_unf_var3a )

GENPROT( her,  her_unb_var1 )
GENPROT( her,  her_unb_var2 )

GENPROT( her2, her2_unb_var1 )
GENPROT( her2, her2_unb_var2 )
GENPROT( her2, her2_unb_var3 )
GENPROT( her2, her2_unb_var4 )
GENPROT( her2, her2_unf_var1 )
GENPROT( her2, her2_unf_var4 )

GENPROT( trmv, trmv_unb_var1 )
GENPROT( trmv, trmv_unb_var2 )
GENPROT( trmv, trmv_unf_var1 )
GENPROT( trmv, trmv_unf_var2 )

GENPROT( trsv, trsv_unb_var1 )
GENPROT( trsv, trsv_unb_var2 )
GENPROT( trsv, trsv_unf_var1 )
GENPROT( trsv, trsv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_ft.h 0000664 0000000 0000000 00000011126 14634250137 0021070 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// -- Level-2 function types ---------------------------------------------------
//
// gemv: function-pointer type for the typed interface.
// BLIS_TAPI_EX_PARAMS follows the last parameter without a comma, so it
// must expand either to nothing or to a comma-led parameter list.
// NOTE(review): presumably the expert cntx/rntm parameters -- confirm in
// the header that defines it.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        trans_t transa, \
        conj_t  conjx, \
        dim_t   m, \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  beta, \
        ctype*  y, \
        inc_t   incy \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( gemv )
// ger: function-pointer type for the typed interface. The two vectors
// (x, y) come before the matrix a, which is the last operand.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        conj_t  conjx, \
        conj_t  conjy, \
        dim_t   m, \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  y, \
        inc_t   incy, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( ger )
// hemv, symv: function-pointer type for the typed interface. Both
// operations share one parameter list; only a single dimension m is
// passed for the matrix a.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        uplo_t  uploa, \
        conj_t  conja, \
        conj_t  conjx, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  beta, \
        ctype*  y, \
        inc_t   incy \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )
// her: function-pointer type for the typed interface. Unlike the other
// templates in this header, this one uses GENTDEFR and types alpha as
// ctype_r* while x and a remain ctype*. The chr parameter is accepted
// but not used by the template body.
#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        uplo_t   uploa, \
        conj_t   conjx, \
        dim_t    m, \
        ctype_r* alpha, \
        ctype*   x, \
        inc_t    incx, \
        ctype*   a, \
        inc_t    rs_a, \
        inc_t    cs_a \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEFR( her )
// syr: function-pointer type for the typed interface. Same shape as the
// her type, except that alpha is typed as ctype* here.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        uplo_t  uploa, \
        conj_t  conjx, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( syr )
// her2, syr2: function-pointer type for the typed interface. Both
// operations share one parameter list with two vectors (x, y) followed by
// the matrix a.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        uplo_t  uploa, \
        conj_t  conjx, \
        conj_t  conjy, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  y, \
        inc_t   incy, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )
// trmv, trsv: function-pointer type for the typed interface. Both
// operations share one parameter list: the matrix a (with uplo, trans and
// diag attributes) followed by the single vector x.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf))( \
        uplo_t  uploa, \
        trans_t transa, \
        diag_t  diaga, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx \
        BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_ft_unb.h 0000664 0000000 0000000 00000010537 14634250137 0021741 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L2_FT_UNB_H
#define BLIS_L2_FT_UNB_H
//
// -- Level-2 function types ---------------------------------------------------
//
// gemv: function-pointer type for the implementation variants (the type
// name carries the _unb infix). The parameter list ends with an explicit
// cntx_t* and has no rntm parameter.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        trans_t transa, \
        conj_t  conjx, \
        dim_t   m, \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  beta, \
        ctype*  y, \
        inc_t   incy, \
        cntx_t* cntx \
    );

INSERT_GENTDEF( gemv )
// ger: function-pointer type for the implementation variants. The
// vectors x and y precede the matrix a; cntx is the final parameter.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        conj_t  conjx, \
        conj_t  conjy, \
        dim_t   m, \
        dim_t   n, \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  y, \
        inc_t   incy, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        cntx_t* cntx \
    );

INSERT_GENTDEF( ger )
// hemv (and symv): function-pointer type for the implementation
// variants. Only hemv is instantiated; the extra conjh parameter is what
// distinguishes this list from the typed-interface hemv/symv type.
// NOTE(review): conjh presumably selects Hermitian vs. symmetric
// behaviour so one variant serves both -- confirm in the variants.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        uplo_t  uploa, \
        conj_t  conja, \
        conj_t  conjx, \
        conj_t  conjh, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  beta, \
        ctype*  y, \
        inc_t   incy, \
        cntx_t* cntx \
    );

INSERT_GENTDEF( hemv )
// her (and syr): function-pointer type for the implementation variants.
// Only her is instantiated. The template is a GENTDEFR, but ctype_r and
// chr are not used in its body: alpha is deliberately typed as ctype*
// (complex for complex datatypes) so that the her variants can also
// perform syr.
#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        uplo_t  uploa, \
        conj_t  conjx, \
        conj_t  conjh, \
        dim_t   m, \
        /* complex alpha allows her variants to also perform syr. */ \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        cntx_t* cntx \
    );

INSERT_GENTDEFR( her )
// her2 (and syr2): function-pointer type for the implementation
// variants. Only her2 is instantiated; the list carries an extra conjh
// parameter compared with the typed-interface her2/syr2 type.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        uplo_t  uploa, \
        conj_t  conjx, \
        conj_t  conjy, \
        conj_t  conjh, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  x, \
        inc_t   incx, \
        ctype*  y, \
        inc_t   incy, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        cntx_t* cntx \
    );

INSERT_GENTDEF( her2 )
// trmv (and trsv): function-pointer type for the implementation
// variants. Both operations are instantiated from the same parameter
// list, which ends with an explicit cntx_t*.
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
    typedef void (*PASTECH3(ch,opname,_unb,tsuf))( \
        uplo_t  uploa, \
        trans_t transa, \
        diag_t  diaga, \
        dim_t   m, \
        ctype*  alpha, \
        ctype*  a, \
        inc_t   rs_a, \
        inc_t   cs_a, \
        ctype*  x, \
        inc_t   incx, \
        cntx_t* cntx \
    );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
#endif
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_oapi.c 0000664 0000000 0000000 00000030150 14634250137 0021400 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the object API macros.
#ifdef BLIS_ENABLE_OAPI
//
// Define object-based interfaces.
//
// Object-API front end for gemv. The generated function:
//   1. calls bli_init_once();
//   2. reads the conj/trans status, dimensions, buffers and strides out of
//      the a, x and y objects;
//   3. runs the operation's _check routine when error checking is enabled;
//   4. makes local copy-casts of alpha and beta in the datatype of a;
//   5. queries the typed function pointer for that datatype and calls it.
// cntx and rntm are not declared in this template; they have to be
// provided by BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS, both of which are
// defined by the file that includes this one.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar casts and the lookup below. */ \
    num_t   dt     = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    trans_t transa = bli_obj_conjtrans_status( a ); \
    conj_t  conjx  = bli_obj_conj_status( x ); \
    dim_t   m      = bli_obj_length( a ); \
    dim_t   n      = bli_obj_width( a ); \
    void*   a_buf  = bli_obj_buffer_at_off( a ); \
    inc_t   rs_a   = bli_obj_row_stride( a ); \
    inc_t   cs_a   = bli_obj_col_stride( a ); \
    void*   x_buf  = bli_obj_buffer_at_off( x ); \
    inc_t   incx   = bli_obj_vector_inc( x ); \
    void*   y_buf  = bli_obj_buffer_at_off( y ); \
    inc_t   incy   = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
    } \
\
    /* Make local copy-casts of both scalars in datatype dt. */ \
    obj_t alpha_cast; \
    obj_t beta_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
    void* beta_buf  = bli_obj_buffer_for_1x1( dt, &beta_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( transa, conjx, \
             m, n, \
             alpha_buf, \
             a_buf, rs_a, cs_a, \
             x_buf, incx, \
             beta_buf, \
             y_buf, incy, \
             cntx, rntm ); \
}

GENFRONT( gemv )
// Object-API front end for ger. The generated function calls
// bli_init_once(), unpacks x, y and a into plain fields, optionally runs
// the _check routine, makes a local copy-cast of alpha in the datatype of
// a, and forwards everything to the typed function pointer queried for
// that datatype. cntx and rntm have to come from BLIS_OAPI_EX_PARAMS or
// BLIS_OAPI_EX_DECLS, which the including file defines.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y, \
    obj_t* a \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar cast and the lookup below. */ \
    num_t  dt    = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    conj_t conjx = bli_obj_conj_status( x ); \
    conj_t conjy = bli_obj_conj_status( y ); \
    dim_t  m     = bli_obj_length( a ); \
    dim_t  n     = bli_obj_width( a ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  incx  = bli_obj_vector_inc( x ); \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  incy  = bli_obj_vector_inc( y ); \
    void*  a_buf = bli_obj_buffer_at_off( a ); \
    inc_t  rs_a  = bli_obj_row_stride( a ); \
    inc_t  cs_a  = bli_obj_col_stride( a ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, y, a ); \
    } \
\
    /* Make a local copy-cast of the scalar in datatype dt. */ \
    obj_t alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( conjx, conjy, \
             m, n, \
             alpha_buf, \
             x_buf, incx, \
             y_buf, incy, \
             a_buf, rs_a, cs_a, \
             cntx, rntm ); \
}

GENFRONT( ger )
// Object-API front end shared by hemv and symv. The generated function
// calls bli_init_once(), unpacks a, x and y into plain fields (including
// the uplo of a), optionally runs the _check routine, makes local
// copy-casts of alpha and beta in the datatype of a, and forwards
// everything to the typed function pointer queried for that datatype.
// cntx and rntm have to come from BLIS_OAPI_EX_PARAMS or
// BLIS_OAPI_EX_DECLS, which the including file defines.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar casts and the lookup below. */ \
    num_t  dt    = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    uplo_t uploa = bli_obj_uplo( a ); \
    conj_t conja = bli_obj_conj_status( a ); \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  m     = bli_obj_length( a ); \
    void*  a_buf = bli_obj_buffer_at_off( a ); \
    inc_t  rs_a  = bli_obj_row_stride( a ); \
    inc_t  cs_a  = bli_obj_col_stride( a ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  incx  = bli_obj_vector_inc( x ); \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  incy  = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
    } \
\
    /* Make local copy-casts of both scalars in datatype dt. */ \
    obj_t alpha_cast; \
    obj_t beta_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, beta, &beta_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
    void* beta_buf  = bli_obj_buffer_for_1x1( dt, &beta_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( uploa, conja, conjx, \
             m, \
             alpha_buf, \
             a_buf, rs_a, cs_a, \
             x_buf, incx, \
             beta_buf, \
             y_buf, incy, \
             cntx, rntm ); \
}

GENFRONT( hemv )
GENFRONT( symv )
// Object-API front end shared by her and syr. The generated function
// calls bli_init_once(), unpacks x and a into plain fields, optionally
// runs the _check routine, makes a local copy-cast of alpha in the
// datatype of a, and forwards everything to the typed function pointer
// queried for that datatype. cntx and rntm have to come from
// BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS, which the including file
// defines.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* a \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar cast and the lookup below. */ \
    num_t  dt    = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    uplo_t uploa = bli_obj_uplo( a ); \
    conj_t conjx = bli_obj_conj_status( x ); \
    dim_t  m     = bli_obj_length( a ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  incx  = bli_obj_vector_inc( x ); \
    void*  a_buf = bli_obj_buffer_at_off( a ); \
    inc_t  rs_a  = bli_obj_row_stride( a ); \
    inc_t  cs_a  = bli_obj_col_stride( a ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, a ); \
    } \
\
    /* Make a local copy-cast of the scalar in datatype dt. */ \
    obj_t alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( uploa, conjx, \
             m, \
             alpha_buf, \
             x_buf, incx, \
             a_buf, rs_a, cs_a, \
             cntx, rntm ); \
}

GENFRONT( her )
GENFRONT( syr )
// Object-API front end shared by her2 and syr2. The generated function
// calls bli_init_once(), unpacks x, y and a into plain fields, optionally
// runs the _check routine, makes a local copy-cast of alpha in the
// datatype of a, and forwards everything to the typed function pointer
// queried for that datatype. cntx and rntm have to come from
// BLIS_OAPI_EX_PARAMS or BLIS_OAPI_EX_DECLS, which the including file
// defines.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y, \
    obj_t* a \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar cast and the lookup below. */ \
    num_t  dt    = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    uplo_t uploa = bli_obj_uplo( a ); \
    conj_t conjx = bli_obj_conj_status( x ); \
    conj_t conjy = bli_obj_conj_status( y ); \
    dim_t  m     = bli_obj_length( a ); \
    void*  x_buf = bli_obj_buffer_at_off( x ); \
    inc_t  incx  = bli_obj_vector_inc( x ); \
    void*  y_buf = bli_obj_buffer_at_off( y ); \
    inc_t  incy  = bli_obj_vector_inc( y ); \
    void*  a_buf = bli_obj_buffer_at_off( a ); \
    inc_t  rs_a  = bli_obj_row_stride( a ); \
    inc_t  cs_a  = bli_obj_col_stride( a ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, x, y, a ); \
    } \
\
    /* Make a local copy-cast of the scalar in datatype dt. */ \
    obj_t alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( uploa, conjx, conjy, \
             m, \
             alpha_buf, \
             x_buf, incx, \
             y_buf, incy, \
             a_buf, rs_a, cs_a, \
             cntx, rntm ); \
}

GENFRONT( her2 )
GENFRONT( syr2 )
// Object-API front end shared by trmv and trsv. The generated function
// calls bli_init_once(), unpacks a (uplo, conj/trans status, diag,
// length, buffer, strides) and x into plain fields, optionally runs the
// _check routine, makes a local copy-cast of alpha in the datatype of a,
// and forwards everything to the typed function pointer queried for that
// datatype. cntx and rntm have to come from BLIS_OAPI_EX_PARAMS or
// BLIS_OAPI_EX_DECLS, which the including file defines.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF)( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* x \
    BLIS_OAPI_EX_PARAMS \
) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of a drives the scalar cast and the lookup below. */ \
    num_t   dt     = bli_obj_dt( a ); \
\
    /* Unpack the object operands into plain fields. */ \
    uplo_t  uploa  = bli_obj_uplo( a ); \
    trans_t transa = bli_obj_conjtrans_status( a ); \
    diag_t  diaga  = bli_obj_diag( a ); \
    dim_t   m      = bli_obj_length( a ); \
    void*   a_buf  = bli_obj_buffer_at_off( a ); \
    inc_t   rs_a   = bli_obj_row_stride( a ); \
    inc_t   cs_a   = bli_obj_col_stride( a ); \
    void*   x_buf  = bli_obj_buffer_at_off( x ); \
    inc_t   incx   = bli_obj_vector_inc( x ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x ); \
    } \
\
    /* Make a local copy-cast of the scalar in datatype dt. */ \
    obj_t alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, alpha, &alpha_cast ); \
    void* alpha_buf = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Fetch the datatype-specific function pointer (void* arguments). */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) tapi_fp = \
        PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    tapi_fp( uploa, transa, diaga, \
             m, \
             alpha_buf, \
             a_buf, rs_a, cs_a, \
             x_buf, incx, \
             cntx, rntm ); \
}

GENFRONT( trmv )
GENFRONT( trsv )
#endif
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_oapi.h 0000664 0000000 0000000 00000005322 14634250137 0021410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Exported prototype template for the object-API operations that take
// (alpha, a, x, beta, y). BLIS_OAPI_EX_PARAMS follows the last parameter
// without a comma, so it must expand either to nothing or to a comma-led
// parameter list supplied by the including header.
#undef  GENPROT
#define GENPROT( opname ) \
    BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF)( \
        obj_t* alpha, \
        obj_t* a, \
        obj_t* x, \
        obj_t* beta, \
        obj_t* y \
        BLIS_OAPI_EX_PARAMS \
    );

GENPROT( gemv )
GENPROT( hemv )
GENPROT( symv )
// Exported prototype template for the object-API operations that take
// (alpha, x, y, a), optionally followed by whatever BLIS_OAPI_EX_PARAMS
// expands to.
#undef  GENPROT
#define GENPROT( opname ) \
    BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF)( \
        obj_t* alpha, \
        obj_t* x, \
        obj_t* y, \
        obj_t* a \
        BLIS_OAPI_EX_PARAMS \
    );

GENPROT( ger )
GENPROT( her2 )
GENPROT( syr2 )
// Exported prototype template for the object-API operations that take
// (alpha, x, a), optionally followed by whatever BLIS_OAPI_EX_PARAMS
// expands to.
#undef  GENPROT
#define GENPROT( opname ) \
    BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF)( \
        obj_t* alpha, \
        obj_t* x, \
        obj_t* a \
        BLIS_OAPI_EX_PARAMS \
    );

GENPROT( her )
GENPROT( syr )
// Exported prototype template for the object-API operations that take
// (alpha, a, x), optionally followed by whatever BLIS_OAPI_EX_PARAMS
// expands to.
#undef  GENPROT
#define GENPROT( opname ) \
    BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF)( \
        obj_t* alpha, \
        obj_t* a, \
        obj_t* x \
        BLIS_OAPI_EX_PARAMS \
    );

GENPROT( trmv )
GENPROT( trsv )
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_oapi_ba.c 0000664 0000000 0000000 00000003667 14634250137 0022057 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Select the basic (non-expert) flavor of the object API: bli_oapi_ba.h
// provides the cpp macros that make the API definition templates omit the
// expert parameters.
#include "bli_oapi_ba.h"
// bli_l2_oapi.c wraps its definitions in #ifdef BLIS_ENABLE_OAPI, so this
// macro must be defined before that file is included below.
#define BLIS_ENABLE_OAPI
// Textually include the level-2 object API definitions so that they are
// compiled in this translation unit with the macros selected above.
#include "bli_l2_oapi.c"
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_oapi_ex.c 0000664 0000000 0000000 00000003665 14634250137 0022107 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_oapi_ex.h"
// Define the macro protecting the object API definitions.
#define BLIS_ENABLE_OAPI
// Include the object API definitions here.
#include "bli_l2_oapi.c"
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_tapi.c 0000664 0000000 0000000 00000033253 14634250137 0021414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Guard the function definitions so that they are only compiled when
// #included from files that define the typed API macros.
#ifdef BLIS_ENABLE_TAPI
//
// Define BLAS-like interfaces with typed operands.
//
// Typed gemv front-end template. It deals with the trivial cases itself
// (empty y, empty x, zero alpha) and otherwise forwards every argument to one
// of two implementation variants, chosen from transa and the storage of a.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	dim_t len_y, len_x; \
\
	/* Work out the lengths of y and x once transa is taken into account. */ \
	bli_set_dims_with_trans( transa, m, n, &len_y, &len_x ); \
\
	/* An empty y leaves nothing to compute. */ \
	if ( bli_zero_dim1( len_y ) ) return; \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* With an empty x or a zero alpha, only the scaling of y by beta is left. */ \
	if ( bli_zero_dim1( len_x ) || PASTEMAC(ch,eq0)( *alpha ) ) \
	{ \
		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
		( BLIS_NO_CONJUGATE, len_y, beta, y, incy, cntx, NULL ); \
		return; \
	} \
\
	/* Select the variant. Without transposition, row storage maps to rvarname */ \
	/* and column/general storage to cvarname; with transposition the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_does_notrans( transa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Hand the untouched arguments to the selected variant. */ \
	variant \
	( \
	  transa, conjx, \
	  m, n, \
	  alpha, \
	  a, rs_a, cs_a, \
	  x, incx, \
	  beta, \
	  y, incy, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 )
// Typed ger front-end template. Returns immediately when either dimension is
// zero or alpha is zero; otherwise forwards to the variant matching the
// storage of a.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	/* Nothing to update if x or y is empty, or if alpha is zero. */ \
	if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Row storage selects rvarname; column or general storage selects cvarname. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                    : PASTEMAC(ch,cvarname); \
\
	/* Hand the untouched arguments to the selected variant. */ \
	variant \
	( \
	  conjx, conjy, \
	  m, n, \
	  alpha, \
	  x, incx, \
	  y, incy, \
	  a, rs_a, cs_a, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 )
// Typed hemv/symv front-end template. Both operations share the hemv
// variants; the conjh template argument is passed through so the variant can
// tell them apart. The variant is chosen from uploa and the storage of a.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* With a zero dimension or a zero alpha, only the scaling of y by beta */ \
	/* is left. */ \
	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) \
	{ \
		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
		( BLIS_NO_CONJUGATE, m, beta, y, incy, cntx, NULL ); \
		return; \
	} \
\
	/* Select the variant. For lower storage, row storage maps to rvarname and */ \
	/* column/general storage to cvarname; for upper storage the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_is_lower( uploa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Forward everything; conjh lets the variant distinguish hemv from symv. */ \
	variant \
	( \
	  uploa, conja, conjx, \
	  conjh, \
	  m, \
	  alpha, \
	  a, rs_a, cs_a, \
	  x, incx, \
	  beta, \
	  y, incy, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC4( hemv, hemv, BLIS_CONJUGATE,    hemv_unf_var1, hemv_unf_var3 )
INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_var3 )
// Typed her front-end template. alpha arrives as ctype_r and is copied into a
// ctype local before being handed to the variant, so the same her variants
// can also serve syr (which passes a ctype alpha).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, ftname, conjh, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype_r* alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	ctype alpha_as_ctype; \
\
	/* Nothing to update if x is empty or alpha is zero. */ \
	if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \
\
	/* Copy alpha into a ctype local so it matches what the variant expects. */ \
	PASTEMAC2(chr,ch,copys)( *alpha, alpha_as_ctype ); \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Select the variant. For lower storage, row storage maps to rvarname and */ \
	/* column/general storage to cvarname; for upper storage the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_is_lower( uploa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Forward everything; conjh lets the variant distinguish her from syr. */ \
	variant \
	( \
	  uploa, conjx, \
	  conjh, \
	  m, \
	  &alpha_as_ctype, \
	  x, incx, \
	  a, rs_a, cs_a, \
	  cntx \
	); \
}

INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 )
// Typed syr front-end template. It reuses the her variants (ftname is her),
// passing conjh through so the variant can tell syr apart from her.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	/* Nothing to update if x is empty or alpha is zero. */ \
	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Select the variant. For lower storage, row storage maps to rvarname and */ \
	/* column/general storage to cvarname; for upper storage the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_is_lower( uploa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Forward everything; conjh lets the variant distinguish syr from her. */ \
	variant \
	( \
	  uploa, conjx, \
	  conjh, \
	  m, \
	  alpha, \
	  x, incx, \
	  a, rs_a, cs_a, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 )
// Typed her2/syr2 front-end template. Both operations share the her2
// variants; conjh is passed through so the variant can tell them apart.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	/* Nothing to update if the dimension is zero or alpha is zero. */ \
	if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* Select the variant. For lower storage, row storage maps to rvarname and */ \
	/* column/general storage to cvarname; for upper storage the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_is_lower( uploa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Forward everything; conjh lets the variant distinguish her2 from syr2. */ \
	variant \
	( \
	  uploa, conjx, conjy, \
	  conjh, \
	  m, \
	  alpha, \
	  x, incx, \
	  y, incy, \
	  a, rs_a, cs_a, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC4( her2, her2, BLIS_CONJUGATE,    her2_unf_var1, her2_unf_var4 )
INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_var4 )
// Typed trmv/trsv front-end template. Both operations use the trmv function
// pointer type (ftname) but are instantiated with their own variants. A zero
// alpha short-circuits to setting x to alpha (i.e. zero).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
	bli_init_once(); \
\
	BLIS_TAPI_EX_DECLS \
\
	/* An empty x leaves nothing to compute. */ \
	if ( bli_zero_dim1( m ) ) return; \
\
	/* Fall back to the context supplied by the gks when none is set. */ \
	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
\
	/* A zero alpha reduces the operation to overwriting x with alpha. */ \
	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
	{ \
		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
		( BLIS_NO_CONJUGATE, m, alpha, x, incx, cntx, NULL ); \
		return; \
	} \
\
	/* Select the variant. Without transposition, row storage maps to rvarname */ \
	/* and column/general storage to cvarname; with transposition the two swap. */ \
	PASTECH2(ch,ftname,_unb_ft) variant = \
	    bli_does_notrans( transa ) \
	    ? ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,rvarname) \
	                                        : PASTEMAC(ch,cvarname) ) \
	    : ( bli_is_row_stored( rs_a, cs_a ) ? PASTEMAC(ch,cvarname) \
	                                        : PASTEMAC(ch,rvarname) ); \
\
	/* Hand the untouched arguments to the selected variant. */ \
	variant \
	( \
	  uploa, transa, diaga, \
	  m, \
	  alpha, \
	  a, rs_a, cs_a, \
	  x, incx, \
	  cntx \
	); \
}

INSERT_GENTFUNC_BASIC3( trmv, trmv, trmv_unf_var1, trmv_unf_var2 )
INSERT_GENTFUNC_BASIC3( trsv, trmv, trsv_unf_var1, trsv_unf_var2 )
#endif
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_tapi.h 0000664 0000000 0000000 00000011065 14634250137 0021416 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands.
//
// Prototype template for the typed gemv interface. The parameter list is the
// same as that of the gemv definition template in bli_l2_tapi.c. EX_SUF and
// BLIS_TAPI_EX_PARAMS are supplied by whichever API-flavor header is included
// before this one (their definitions are not visible in this header).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( gemv )
// Prototype template for the typed ger interface. The parameter list is the
// same as that of the ger definition template in bli_l2_tapi.c: two vectors
// (x, y) with their conjugation flags, a scalar alpha, and the matrix a with
// its row and column strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* a, inc_t rs_a, inc_t cs_a \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( ger )
// Prototype template shared by the typed hemv and symv interfaces, which have
// identical parameter lists (see the matching definition template in
// bli_l2_tapi.c). The template is instantiated once per operation below.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
uplo_t uploa, \
conj_t conja, \
conj_t conjx, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )
// Prototype template for the typed her interface. Unlike the other level-2
// prototypes here, alpha is a pointer to ctype_r rather than ctype, which is
// why this template uses the GENTPROTR form with the extra ctype_r/chr
// arguments.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
uplo_t uploa, \
conj_t conjx, \
dim_t m, \
ctype_r* alpha, \
ctype* x, inc_t incx, \
ctype* a, inc_t rs_a, inc_t cs_a \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROTR_BASIC0( her )
// Prototype template for the typed syr interface. Same shape as the her
// prototype except that alpha is a pointer to ctype.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
uplo_t uploa, \
conj_t conjx, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* a, inc_t rs_a, inc_t cs_a \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( syr )
// Prototype template shared by the typed her2 and syr2 interfaces, which have
// identical parameter lists (see the matching definition template in
// bli_l2_tapi.c). The template is instantiated once per operation below.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
uplo_t uploa, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* a, inc_t rs_a, inc_t cs_a \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )
// Prototype template shared by the typed trmv and trsv interfaces, which have
// identical parameter lists (see the matching definition template in
// bli_l2_tapi.c). Note there is no separate output vector: x is the only
// vector operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx \
BLIS_TAPI_EX_PARAMS \
);
INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_tapi_ba.c 0000664 0000000 0000000 00000003665 14634250137 0022062 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// omitting expert parameters.
#include "bli_tapi_ba.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l2_tapi.c"
cython-blis-1.0.0/blis/_src/frame/2/bli_l2_tapi_ex.c 0000664 0000000 0000000 00000003663 14634250137 0022112 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Include cpp macros that instantiate the API definition templates as
// having expert parameters.
#include "bli_tapi_ex.h"
// Define the macro protecting the typed API definitions.
#define BLIS_ENABLE_TAPI
// Include the typed API definitions here.
#include "bli_l2_tapi.c"
cython-blis-1.0.0/blis/_src/frame/2/gemv/ 0000775 0000000 0000000 00000000000 14634250137 0020020 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/gemv/amd/ 0000775 0000000 0000000 00000000000 14634250137 0020561 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c 0000664 0000000 0000000 00000013465 14634250137 0025145 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// gemv variant 2 template with hard-wired kernels. Instead of querying the
// context, the scalv and axpyf kernels are named directly through the
// scalvsuf/axpyfsuf suffixes and the fusing factor is the fusefac template
// argument. y is first scaled by beta; then a is consumed in slices of up to
// fusefac along its cs_at direction, each slice contributing
// y = y + alpha * A1 * x1 through one axpyf call.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, scalvsuf, axpyfsuf, fusefac ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
	dim_t y_len, total; \
	inc_t rs_at, cs_at; \
\
	/* Dimensions and strides of a after applying transa. */ \
	bli_set_dims_incs_with_trans( transa, \
	                              m, n, rs_a, cs_a, \
	                              &y_len, &total, &rs_at, &cs_at ); \
\
	const conj_t conja = bli_extract_conj( transa ); \
\
	/* y = beta * y. A zero beta is not special-cased here because the scalv */ \
	/* kernel handles that case itself. */ \
	PASTEMAC2(ch,scalv,scalvsuf) \
	( BLIS_NO_CONJUGATE, y_len, beta, y, incy, cntx ); \
\
	/* A zero alpha means the scaled y is already the result. */ \
	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* The fusing factor comes from the template argument, not the context. */ \
	const dim_t fuse_max = fusefac; \
\
	dim_t done = 0; \
	while ( done < total ) \
	{ \
		/* Width of the current slice. */ \
		const dim_t width = bli_determine_blocksize_dim_f( done, total, fuse_max ); \
\
		ctype* a_slice = a + (done)*cs_at; \
		ctype* x_slice = x + (done)*incx; \
\
		/* y = y + alpha * a_slice * x_slice; */ \
		PASTEMAC2(ch,axpyf,axpyfsuf) \
		( \
		  conja, conjx, \
		  y_len, width, \
		  alpha, \
		  a_slice, rs_at, cs_at, \
		  x_slice, incx, \
		  y,       incy, \
		  cntx \
		); \
\
		done += width; \
	} \
}

// Only float, double and scomplex are instantiated with this template; the
// generic INSERT_GENTFUNC_BASIC0 instantiation and the dcomplex line are
// deliberately left commented out.
//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
GENTFUNC( float,    s, gemv_unf_var2, _zen_int10, _zen_int_5,    5 )
GENTFUNC( double,   d, gemv_unf_var2, _zen_int10, _zen_int_16x4, 4 )
GENTFUNC( scomplex, c, gemv_unf_var2, _zen_int10, _zen_int_4,    4 )
//GENTFUNC( dcomplex, z, gemv_unf_var2, _zen_int10, _ex, 1 )
// gemv variant 2 template that takes its axpyf kernel and fusing factor from
// the context. y is first set to zero (beta == 0) or scaled by beta; then a is
// consumed in slices of up to the fusing factor along its cs_at direction,
// each slice contributing y = y + alpha * A1 * x1 through one kernel call.
// Only dcomplex is instantiated from this template.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype* zero = PASTEMAC(ch,0); \
	dim_t  y_len, total; \
	inc_t  rs_at, cs_at; \
\
	/* Dimensions and strides of a after applying transa. */ \
	bli_set_dims_incs_with_trans( transa, \
	                              m, n, rs_a, cs_a, \
	                              &y_len, &total, &rs_at, &cs_at ); \
\
	const conj_t conja = bli_extract_conj( transa ); \
\
	/* Scale y by a nonzero beta; overwrite it with zero otherwise. */ \
	if ( !PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		/* y = beta * y; */ \
		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
		( BLIS_NO_CONJUGATE, y_len, beta, y, incy, cntx, NULL ); \
	} \
	else \
	{ \
		/* y = 0; */ \
		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
		( BLIS_NO_CONJUGATE, y_len, zero, y, incy, cntx, NULL ); \
	} \
\
	/* Kernel and fusing factor both come from the context. */ \
	PASTECH(ch,axpyf_ker_ft) axpyf_fp = \
	    bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
	const dim_t fuse_max = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
	dim_t done = 0; \
	while ( done < total ) \
	{ \
		/* Width of the current slice. */ \
		const dim_t width = bli_determine_blocksize_dim_f( done, total, fuse_max ); \
\
		ctype* a_slice = a + (done)*cs_at; \
		ctype* x_slice = x + (done)*incx; \
\
		/* y = y + alpha * a_slice * x_slice; */ \
		axpyf_fp \
		( \
		  conja, conjx, \
		  y_len, width, \
		  alpha, \
		  a_slice, rs_at, cs_at, \
		  x_slice, incx, \
		  y,       incy, \
		  cntx \
		); \
\
		done += width; \
	} \
}

//INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
GENTFUNC( dcomplex, z, gemv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv.h 0000664 0000000 0000000 00000003470 14634250137 0021761 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_gemv_cntl.h"
//#include "bli_gemv_front.h"
//#include "bli_gemv_int.h"
#include "bli_gemv_var.h"
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_unb_var1.c 0000664 0000000 0000000 00000005664 14634250137 0023560 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Unblocked gemv variant 1 template. y is computed one element at a time:
// for each i, the dotxv kernel obtained from the context evaluates
//   psi1 = beta * psi1 + alpha * a1t * x1
// where a1t is the i-th slice of a along rs_at (after applying transa) and
// psi1 is the i-th element of y.
//
// Parameters:
//   transa        - transposition/conjugation to apply to a.
//   conjx         - conjugation to apply to x.
//   m, n          - dimensions of a before transa is applied.
//   alpha, beta   - scalars, passed by pointer.
//   a, rs_a, cs_a - matrix a and its row/column strides.
//   x, incx       - vector x and its stride.
//   y, incy       - vector y (updated in place) and its stride.
//   cntx          - context from which the dotxv kernel is queried.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  a1t; \
	ctype*  x1; \
	ctype*  psi1; \
	dim_t   i; \
	dim_t   n_elem, n_iter; \
	inc_t   rs_at, cs_at; \
	conj_t  conja; \
\
	/* Dimensions and strides of a after applying transa. Note the output */ \
	/* order: n_iter takes the first dimension (one iteration per element */ \
	/* of y) and n_elem the second (the length of each dot product). */ \
	bli_set_dims_incs_with_trans( transa, \
	                              m, n, rs_a, cs_a, \
	                              &n_iter, &n_elem, &rs_at, &cs_at ); \
\
	conja = bli_extract_conj( transa ); \
\
	PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
	for ( i = 0; i < n_iter; ++i ) \
	{ \
		a1t  = a + (i  )*rs_at + (0  )*cs_at; \
		/* x is always read from its start; the offset is expressed with x's */ \
		/* own stride (incx), matching the x1/incx pair passed to the kernel. */ \
		x1   = x + (0  )*incx; \
		psi1 = y + (i  )*incy; \
\
		/* psi1 = beta * psi1 + alpha * a1t * x1; */ \
		kfp_dv \
		( \
		  conja, \
		  conjx, \
		  n_elem, \
		  alpha, \
		  a1t, cs_at, \
		  x1,  incx, \
		  beta, \
		  psi1, \
		  cntx  \
		); \
	} \
}

INSERT_GENTFUNC_BASIC0( gemv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_unb_var2.c 0000664 0000000 0000000 00000006763 14634250137 0023562 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed, unblocked gemv (variant 2), built on the axpyv level-1v kernel. \
   y is first overwritten with zeros (when *beta tests equal to zero) or \
   scaled by beta; afterwards one axpyv call per iteration accumulates \
   y = y + alpha * chi1 * a1, where a1 starts at a + j*cs_at and chi1 is \
   the element at x + j*incx. */ \
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    dim_t len_y, n_cols; \
    inc_t rs_at, cs_at; \
\
    /* Here the vector length is the first output, the iteration count the \
       second. */ \
    bli_set_dims_incs_with_trans( transa, \
                                  m, n, rs_a, cs_a, \
                                  &len_y, &n_cols, &rs_at, &cs_at ); \
\
    const conj_t conja = bli_extract_conj( transa ); \
\
    /* Pre-process y: scale by beta unless beta is zero, in which case y is \
       set to zero outright. */ \
    if ( !( PASTEMAC(ch,eq0)( *beta ) ) ) \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          len_y, \
          beta, \
          y, incy, \
          cntx, \
          NULL  \
        ); \
    } \
    else \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          len_y, \
          PASTEMAC(ch,0), \
          y, incy, \
          cntx, \
          NULL  \
        ); \
    } \
\
    /* Fetch the axpyv kernel for this datatype from the context. */ \
    PASTECH(ch,axpyv_ker_ft) axpy_ker = \
        bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( dim_t j = 0; j < n_cols; ++j ) \
    { \
        ctype* a_col = a + j*cs_at; \
        ctype* chi   = x + j*incx; \
        ctype  scaled_chi; \
\
        /* scaled_chi = alpha * conjx( chi ); */ \
        PASTEMAC(ch,copycjs)( conjx, *chi, scaled_chi ); \
        PASTEMAC(ch,scals)( *alpha, scaled_chi ); \
\
        /* y = y + scaled_chi * a_col; */ \
        axpy_ker \
        ( \
          conja, \
          len_y, \
          &scaled_chi, \
          a_col, rs_at, \
          y,     incy, \
          cntx \
        ); \
    } \
}

INSERT_GENTFUNC_BASIC0( gemv_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_unf_var1.c 0000664 0000000 0000000 00000006145 14634250137 0023557 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed, unblocked-fused gemv (variant 1), built on the dotxf level-1f \
   kernel. bli_set_dims_incs_with_trans() derives the iteration count \
   (n_iter), the vector length (n_elem) and the strides of A from transa, \
   m, n, rs_a and cs_a. The loop walks n_iter in steps of f, where f is \
   chosen by bli_determine_blocksize_dim_f() from the BLIS_DF blocksize \
   stored in the context. Per the comment at the call site each kernel \
   call computes y1 = beta * y1 + alpha * A1 * x, with A1 starting at \
   a + i*rs_at and y1 at y + i*incy. \
   Parameters: \
     transa        - trans/conj setting for A; its conj part becomes conja. \
     conjx         - conjugation setting forwarded to the kernel for x. \
     m, n          - dimensions of A. \
     alpha, beta   - pointers to the scalars, forwarded to the kernel. \
     a, rs_a, cs_a - matrix buffer with its row and column strides. \
     x, incx       - input vector and its stride. \
     y, incy       - vector updated in place, and its stride. \
     cntx          - context supplying the kernel and the fusing factor. */ \
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  A1; \
    ctype*  x1; \
    ctype*  y1; \
    dim_t   i; \
    dim_t   b_fuse, f; \
    dim_t   n_elem, n_iter; \
    inc_t   rs_at, cs_at; \
    conj_t  conja; \
\
    /* Note the output order: the iteration count comes first here, the \
       vector length second. */ \
    bli_set_dims_incs_with_trans( transa, \
                                  m, n, rs_a, cs_a, \
                                  &n_iter, &n_elem, &rs_at, &cs_at ); \
\
    conja = bli_extract_conj( transa ); \
\
    PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
    /* Query the context for the kernel function pointer and fusing factor. */ \
    kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
    b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
    for ( i = 0; i < n_iter; i += f ) \
    { \
        f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
        A1 = a + (i  )*rs_at + (0  )*cs_at; \
        /* x is always used from its first element; the offset is expressed \
           with x's own stride (incx), not y's. */ \
        x1 = x + (0  )*incx; \
        y1 = y + (i  )*incy; \
\
        /* y1 = beta * y1 + alpha * A1 * x; */ \
        /* The strides of A1 are handed over as ( cs_at, rs_at ). */ \
        kfp_df \
        ( \
          conja, \
          conjx, \
          n_elem, \
          f, \
          alpha, \
          A1,   cs_at, rs_at, \
          x1,   incx, \
          beta, \
          y1,   incy, \
          cntx \
        ); \
    } \
}

INSERT_GENTFUNC_BASIC0( gemv_unf_var1 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_unf_var2.c 0000664 0000000 0000000 00000007071 14634250137 0023557 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed, unblocked-fused gemv (variant 2), built on the axpyf level-1f \
   kernel. y is first overwritten with zeros (when *beta tests equal to \
   zero) or scaled by beta. The loop then advances through the iteration \
   space in steps chosen by bli_determine_blocksize_dim_f() from the \
   BLIS_AF blocksize, and each kernel call accumulates \
   y = y + alpha * A1 * x1, with A1 starting at a + j*cs_at and x1 at \
   x + j*incx. */ \
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    dim_t len_y, n_cols; \
    inc_t rs_at, cs_at; \
\
    /* Here the vector length is the first output, the iteration count the \
       second. */ \
    bli_set_dims_incs_with_trans( transa, \
                                  m, n, rs_a, cs_a, \
                                  &len_y, &n_cols, &rs_at, &cs_at ); \
\
    const conj_t conja = bli_extract_conj( transa ); \
\
    /* Pre-process y: scale by beta unless beta is zero, in which case y is \
       set to zero outright. */ \
    if ( !( PASTEMAC(ch,eq0)( *beta ) ) ) \
    { \
        /* y = beta * y; */ \
        PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          len_y, \
          beta, \
          y, incy, \
          cntx, \
          NULL  \
        ); \
    } \
    else \
    { \
        /* y = 0; */ \
        PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
        ( \
          BLIS_NO_CONJUGATE, \
          len_y, \
          PASTEMAC(ch,0), \
          y, incy, \
          cntx, \
          NULL  \
        ); \
    } \
\
    /* Fetch the axpyf kernel and its fusing factor from the context. */ \
    PASTECH(ch,axpyf_ker_ft) fused_axpy = \
        bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
    const dim_t fuse_max = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
    dim_t j = 0; \
    while ( j < n_cols ) \
    { \
        /* Number of columns handled by this kernel call. */ \
        const dim_t width = bli_determine_blocksize_dim_f( j, n_cols, fuse_max ); \
\
        ctype* a_panel = a + j*cs_at; \
        ctype* x_part  = x + j*incx; \
\
        /* y = y + alpha * a_panel * x_part; */ \
        fused_axpy \
        ( \
          conja, \
          conjx, \
          len_y, \
          width, \
          alpha, \
          a_panel, rs_at, cs_at, \
          x_part,  incx, \
          y,       incy, \
          cntx \
        ); \
\
        j += width; \
    } \
}

INSERT_GENTFUNC_BASIC0( gemv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_var.h 0000664 0000000 0000000 00000005230 14634250137 0022625 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) emits the prototype of one object-based gemv variant.
// The function name is produced by PASTEMAC0(opname); every variant takes
// the five obj_t operands (alpha, a, x, beta, y) followed by a context and a
// control-tree pointer, and returns nothing.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* beta, \
obj_t* y, \
cntx_t* cntx, \
cntl_t* cntl \
);
// One prototype per variant: blocked (blk), unblocked (unb) and
// unblocked-fused (unf), variants 1 and 2 of each.
GENPROT( gemv_blk_var1 )
GENPROT( gemv_blk_var2 )
GENPROT( gemv_unb_var1 )
GENPROT( gemv_unb_var2 )
GENPROT( gemv_unf_var1 )
GENPROT( gemv_unf_var2 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) emits the prototype of one typed gemv
// variant. The function name is produced by PASTEMAC(ch,varname); ctype is
// the element type used for alpha, a, x, beta and y. The matrix is described
// by a buffer plus row/column strides, each vector by a buffer plus one
// stride.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transa, \
conj_t conjx, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
);
// Instantiate the typed prototypes for the unblocked and unblocked-fused
// variants through INSERT_GENTPROT_BASIC0.
INSERT_GENTPROT_BASIC0( gemv_unb_var1 )
INSERT_GENTPROT_BASIC0( gemv_unb_var2 )
INSERT_GENTPROT_BASIC0( gemv_unf_var1 )
INSERT_GENTPROT_BASIC0( gemv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_var_oapi.c 0000664 0000000 0000000 00000006200 14634250137 0023626 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENFRONT
#define GENFRONT( opname, varname ) \
\
/* Object-based wrapper for the typed variant `varname`. It unpacks the \
   datatype, trans/conj settings, dimensions, buffers and strides from the \
   obj_t operands, looks up the void*-based function for the datatype of \
   `a` via PASTEMAC(varname,_qfp), and calls it. The cntl argument is \
   accepted but not used in this body. */ \
void PASTEMAC0(varname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
    bli_init_once(); \
\
    /* The datatype of A selects both the kernel and the scalar buffers. */ \
    num_t     dt        = bli_obj_dt( a ); \
\
    trans_t   transa    = bli_obj_conjtrans_status( a ); \
    conj_t    conjx     = bli_obj_conj_status( x ); \
\
    dim_t     m         = bli_obj_length( a ); \
    dim_t     n         = bli_obj_width( a ); \
\
    void*     buf_a     = bli_obj_buffer_at_off( a ); \
    inc_t     rs_a      = bli_obj_row_stride( a ); \
    inc_t     cs_a      = bli_obj_col_stride( a ); \
\
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     incx      = bli_obj_vector_inc( x ); \
\
    void*     buf_y     = bli_obj_buffer_at_off( y ); \
    inc_t     incy      = bli_obj_vector_inc( y ); \
\
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
    void*     buf_beta  = bli_obj_buffer_for_1x1( dt, beta ); \
\
    /* Query a type-specific function pointer, except one that uses \
       void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,_unb,_vft) f = \
    PASTEMAC(varname,_qfp)( dt ); \
\
    f \
    ( \
      transa, \
      conjx, \
      m, \
      n, \
      buf_alpha, \
      buf_a, rs_a, cs_a, \
      buf_x, incx, \
      buf_beta, \
      buf_y, incy, \
      cntx \
    ); \
}

/* The macro definition ends at the closing brace above, with no trailing
   line continuation, so the instantiations below can never be absorbed
   into the macro body. */
GENFRONT( gemv, gemv_unb_var1 )
GENFRONT( gemv, gemv_unb_var2 )
GENFRONT( gemv, gemv_unf_var1 )
GENFRONT( gemv, gemv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/bli_gemv_var_oapi.c.prev 0000664 0000000 0000000 00000006206 14634250137 0024607 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENFRONT
#define GENFRONT( ftname, opname ) \
\
/* Object-based wrapper for the typed variant `opname`. A static table of \
   void*-based function pointers is declared with GENARRAY_VFP and indexed \
   by the datatype of `a`; the selected function receives the parameters \
   unpacked from the obj_t operands. The cntl argument is accepted but not \
   used in this body. */ \
/*static gemv_vft GENARRAY(ftypes,gemv_unb_var1);*/ \
static GENARRAY_VFP(ftname,opname); \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y, \
       cntx_t* cntx, \
       gemv_t* cntl  \
     ) \
{ \
    /* The datatype of A selects both the table entry and the scalar \
       buffers. */ \
    num_t     dt        = bli_obj_dt( a ); \
\
    trans_t   transa    = bli_obj_conjtrans_status( a ); \
    conj_t    conjx     = bli_obj_conj_status( x ); \
\
    dim_t     m         = bli_obj_length( a ); \
    dim_t     n         = bli_obj_width( a ); \
\
    void*     buf_a     = bli_obj_buffer_at_off( a ); \
    inc_t     rs_a      = bli_obj_row_stride( a ); \
    inc_t     cs_a      = bli_obj_col_stride( a ); \
\
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     incx      = bli_obj_vector_inc( x ); \
\
    void*     buf_y     = bli_obj_buffer_at_off( y ); \
    inc_t     incy      = bli_obj_vector_inc( y ); \
\
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
    void*     buf_beta  = bli_obj_buffer_for_1x1( dt, beta ); \
\
    PASTECH(ftname,_vft) f = PASTECH(opname,_vfp)[dt]; \
\
    /* Invoke the void pointer-based function for the given datatype. */ \
    f( \
       transa, \
       conjx, \
       m, \
       n, \
       buf_alpha, \
       buf_a, rs_a, cs_a, \
       buf_x, incx, \
       buf_beta, \
       buf_y, incy, \
       cntx \
     ); \
}

/* The macro definition ends at the closing brace above, with no trailing
   line continuation, so the instantiations below can never be absorbed
   into the macro body. */
GENFRONT( gemv, gemv_unb_var1 )
GENFRONT( gemv, gemv_unb_var2 )
GENFRONT( gemv, gemv_unf_var1 )
GENFRONT( gemv, gemv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/ 0000775 0000000 0000000 00000000000 14634250137 0021141 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_blk_var1.c 0000664 0000000 0000000 00000007406 14634250137 0024661 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked gemv, variant 1. The loop steps through the length of A (after
// transposition); for each block it acquires the matching partitions of A
// and y, packs them as directed by the control tree, runs the sub-gemv on
// the packed operands and unpacks y. x is used whole in every iteration.
void bli_gemv_blk_var1( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  x,
                        obj_t*  beta,
                        obj_t*  y,
                        cntx_t* cntx,
                        gemv_t* cntl )
{
    obj_t a_part, a_packed;
    obj_t y_part, y_packed;

    // Prepare the pack objects before their first use.
    bli_obj_init_pack( &a_packed );
    bli_obj_init_pack( &y_packed );

    // Extent of the dimension being partitioned.
    const dim_t m_total = bli_obj_length_after_trans( a );

    // Walk the m dimension one algorithmic block at a time.
    dim_t off = 0;
    while ( off < m_total )
    {
        // Size of the current block.
        const dim_t blk = bli_determine_blocksize_f( off, m_total, a,
                                                     bli_cntl_bszid( cntl ), cntx );

        // Carve the current block out of A and y.
        bli_acquire_mpart_t2b( BLIS_SUBPART1, off, blk, a, &a_part );
        bli_acquire_vpart_f2b( BLIS_SUBPART1, off, blk, y, &y_part );

        // Set up the pack objects for the two partitions (if needed).
        bli_packm_init( &a_part, &a_packed,
                        cntx, bli_cntl_sub_packm_a( cntl ) );
        bli_packv_init( &y_part, &y_packed,
                        cntx, bli_cntl_sub_packv_y( cntl ) );

        // Copy/pack the partitions (if needed).
        bli_packm_int( &a_part, &a_packed,
                       cntx, bli_cntl_sub_packm_a( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &y_part, &y_packed,
                       cntx, bli_cntl_sub_packv_y( cntl ) );

        // y_part = beta * y_part + alpha * a_part * x;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      alpha,
                      &a_packed,
                      x,
                      beta,
                      &y_packed,
                      cntx,
                      bli_cntl_sub_gemv( cntl ) );

        // Copy the result back into y (if it was packed).
        bli_unpackv_int( &y_packed, &y_part,
                         cntx, bli_cntl_sub_unpackv_y( cntl ) );

        off += blk;
    }

    // Hand any packing buffers acquired along the way back to the memory
    // manager.
    bli_packm_release( &a_packed, bli_cntl_sub_packm_a( cntl ) );
    bli_packv_release( &y_packed, bli_cntl_sub_packv_y( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_blk_var2.c 0000664 0000000 0000000 00000007401 14634250137 0024655 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked gemv, variant 2. y is scaled by beta once up front; the loop then
// steps through the width of A (after transposition), acquiring the matching
// partitions of A and x, packing them as directed by the control tree, and
// accumulating each partial product into the whole of y.
void bli_gemv_blk_var2( obj_t*  alpha,
                        obj_t*  a,
                        obj_t*  x,
                        obj_t*  beta,
                        obj_t*  y,
                        cntx_t* cntx,
                        gemv_t* cntl )
{
    obj_t a_part, a_packed;
    obj_t x_part, x_packed;

    // Prepare the pack objects before their first use.
    bli_obj_init_pack( &a_packed );
    bli_obj_init_pack( &x_packed );

    // Extent of the dimension being partitioned.
    const dim_t n_total = bli_obj_width_after_trans( a );

    // y = beta * y;
    bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) );

    // Walk the "k" dimension (n dimension of A) one algorithmic block at a
    // time.
    dim_t off = 0;
    while ( off < n_total )
    {
        // Size of the current block.
        const dim_t blk = bli_determine_blocksize_f( off, n_total, a,
                                                     bli_cntl_bszid( cntl ), cntx );

        // Carve the current block out of A and x.
        bli_acquire_mpart_l2r( BLIS_SUBPART1, off, blk, a, &a_part );
        bli_acquire_vpart_f2b( BLIS_SUBPART1, off, blk, x, &x_part );

        // Set up the pack objects for the two partitions (if needed).
        bli_packm_init( &a_part, &a_packed,
                        cntx, bli_cntl_sub_packm_a( cntl ) );
        bli_packv_init( &x_part, &x_packed,
                        cntx, bli_cntl_sub_packv_x( cntl ) );

        // Copy/pack the partitions (if needed).
        bli_packm_int( &a_part, &a_packed,
                       cntx, bli_cntl_sub_packm_a( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &x_part, &x_packed,
                       cntx, bli_cntl_sub_packv_x( cntl ) );

        // y = y + alpha * a_part * x_part;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      alpha,
                      &a_packed,
                      &x_packed,
                      &BLIS_ONE,
                      y,
                      cntx,
                      bli_cntl_sub_gemv( cntl ) );

        off += blk;
    }

    // Hand any packing buffers acquired along the way back to the memory
    // manager.
    bli_packm_release( &a_packed, bli_cntl_sub_packm_a( cntl ) );
    bli_packv_release( &x_packed, bli_cntl_sub_packv_x( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_cntl.c 0000664 0000000 0000000 00000020524 14634250137 0024114 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control-tree nodes defined in other translation units; they are plugged
// into the gemv trees built by bli_gemv_cntl_init().
extern scalv_t* scalv_cntl;
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackv_t* unpackv_cntl;
// gemv control trees owned by this file. All start out NULL, are created by
// bli_gemv_cntl_init() and are freed by bli_gemv_cntl_finalize().
// Naming as used in bli_gemv_cntl_init(): bs_ke = lowest-level kernel trees,
// rp = row-panel problems, cp = column-panel problems, ge = general (large)
// problems; the dot/axpy suffix selects which kernel tree sits at the bottom.
gemv_t* gemv_cntl_bs_ke_dot = NULL;
gemv_t* gemv_cntl_bs_ke_axpy = NULL;
gemv_t* gemv_cntl_rp_bs_dot = NULL;
gemv_t* gemv_cntl_rp_bs_axpy = NULL;
gemv_t* gemv_cntl_cp_bs_dot = NULL;
gemv_t* gemv_cntl_cp_bs_axpy = NULL;
gemv_t* gemv_cntl_ge_dot = NULL;
gemv_t* gemv_cntl_ge_axpy = NULL;
// Builds the eight file-scope gemv control trees. The trees are created
// bottom-up because the higher-level nodes store pointers to the lower-level
// ones: kernel trees first, then the row-panel / column-panel trees, and
// finally the general trees that sit on top of the row-panel trees.
// Takes no arguments; the definition spells this as ( void ) so that it is a
// proper prototype matching the declaration in the header.
void bli_gemv_cntl_init( void )
{
    // Create control trees for the lowest-level kernels. These trees induce
    // operations on (presumably) relatively small block-subvector problems.
    gemv_cntl_bs_ke_dot
    =
    bli_gemv_cntl_obj_create( BLIS_UNB_FUSED,
                              BLIS_VARIANT1,
                              0,
                              NULL, NULL, NULL,
                              NULL, NULL, NULL );
    gemv_cntl_bs_ke_axpy
    =
    bli_gemv_cntl_obj_create( BLIS_UNB_FUSED,
                              BLIS_VARIANT2,
                              0,
                              NULL, NULL, NULL,
                              NULL, NULL, NULL );

    // Create control trees for problems with relatively small m dimension
    // (ie: where trans(A) is a row panel problem).
    gemv_cntl_rp_bs_dot
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT2,
                              BLIS_N2,
                              scalv_cntl,   // scale y up-front
                              packm_cntl,   // pack A1 (if needed)
                              packv_cntl,   // pack x1 (if needed)
                              NULL,         // y is not partitioned in var2
                              gemv_cntl_bs_ke_dot,
                              NULL );       // y is not partitioned in var2
    gemv_cntl_rp_bs_axpy
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT2,
                              BLIS_N2,
                              scalv_cntl,   // scale y up-front
                              packm_cntl,   // pack A1 (if needed)
                              packv_cntl,   // pack x1 (if needed)
                              NULL,         // y is not partitioned in var2
                              gemv_cntl_bs_ke_axpy,
                              NULL );       // y is not partitioned in var2

    // Create control trees for problems with relatively small n dimension
    // (ie: where trans(A) is a column panel problem).
    gemv_cntl_cp_bs_dot
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,
                              BLIS_M2,
                              NULL,         // no scaling in blk_var1
                              packm_cntl,   // pack A1 (if needed)
                              NULL,         // x is not partitioned in var1
                              packv_cntl,   // pack y1 (if needed)
                              gemv_cntl_bs_ke_dot,
                              unpackv_cntl ); // unpack y1 (if packed)
    gemv_cntl_cp_bs_axpy
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,
                              BLIS_M2,
                              NULL,         // no scaling in blk_var1
                              packm_cntl,   // pack A1 (if needed)
                              NULL,         // x is not partitioned in var1
                              packv_cntl,   // pack y1 (if needed)
                              gemv_cntl_bs_ke_axpy,
                              unpackv_cntl ); // unpack y1 (if packed)

    // Create control trees for generally large problems. Here, we choose a
    // variant that partitions subproblems into row panels.
    gemv_cntl_ge_dot
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,
                              BLIS_M2,
                              NULL,         // no scaling in blk_var1
                              NULL,         // do not pack A1
                              NULL,         // x is not partitioned in var1
                              packv_cntl,   // pack y1 (if needed)
                              gemv_cntl_rp_bs_dot,
                              unpackv_cntl ); // unpack y1 (if packed)
    gemv_cntl_ge_axpy
    =
    bli_gemv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,
                              BLIS_M2,
                              NULL,         // no scaling in blk_var1
                              NULL,         // do not pack A1
                              NULL,         // x is not partitioned in var1
                              packv_cntl,   // pack y1 (if needed)
                              gemv_cntl_rp_bs_axpy,
                              unpackv_cntl ); // unpack y1 (if packed)
}
void bli_gemv_cntl_finalize()
{
bli_cntl_free_node( gemv_cntl_bs_ke_dot );
bli_cntl_free_node( gemv_cntl_bs_ke_axpy );
bli_cntl_free_node( gemv_cntl_rp_bs_dot );
bli_cntl_free_node( gemv_cntl_rp_bs_axpy );
bli_cntl_free_node( gemv_cntl_cp_bs_dot );
bli_cntl_free_node( gemv_cntl_cp_bs_axpy );
bli_cntl_free_node( gemv_cntl_ge_dot );
bli_cntl_free_node( gemv_cntl_ge_axpy );
}
// Allocates a gemv control-tree node with bli_malloc_intl() and fills in all
// of its fields from the arguments.
//   impl_type, var_num, bszid - stored verbatim in the node.
//   sub_*                     - pointers to child nodes; stored, not copied.
// Returns the new node. If the allocator hands back NULL, NULL is returned
// instead of writing through it.
// NOTE(review): whether bli_malloc_intl() can return NULL at all is not
// visible from this file -- confirm.
gemv_t* bli_gemv_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  scalv_t*   sub_scalv,
                                  packm_t*   sub_packm_a,
                                  packv_t*   sub_packv_x,
                                  packv_t*   sub_packv_y,
                                  gemv_t*    sub_gemv,
                                  unpackv_t* sub_unpackv_y )
{
    gemv_t* cntl = ( gemv_t* ) bli_malloc_intl( sizeof(gemv_t) );

    if ( cntl == NULL ) return NULL;

    // Field initialization is shared with bli_gemv_cntl_obj_init() so the
    // two entry points cannot drift apart.
    bli_gemv_cntl_obj_init( cntl,
                            impl_type,
                            var_num,
                            bszid,
                            sub_scalv,
                            sub_packm_a,
                            sub_packv_x,
                            sub_packv_y,
                            sub_gemv,
                            sub_unpackv_y );

    return cntl;
}
// Fills in every field of an existing gemv control-tree node.
//   cntl                      - node to overwrite; must point to valid
//                               storage.
//   impl_type, var_num, bszid - stored verbatim in the node.
//   sub_*                     - pointers to child nodes; stored, not copied.
void bli_gemv_cntl_obj_init( gemv_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             scalv_t*   sub_scalv,
                             packm_t*   sub_packm_a,
                             packv_t*   sub_packv_x,
                             packv_t*   sub_packv_y,
                             gemv_t*    sub_gemv,
                             unpackv_t* sub_unpackv_y )
{
    // Assign all members in one go via a compound literal.
    *cntl = ( gemv_t )
    {
        .impl_type     = impl_type,
        .var_num       = var_num,
        .bszid         = bszid,
        .sub_scalv     = sub_scalv,
        .sub_packm_a   = sub_packm_a,
        .sub_packv_x   = sub_packv_x,
        .sub_packv_y   = sub_packv_y,
        .sub_gemv      = sub_gemv,
        .sub_unpackv_y = sub_unpackv_y
    };
}
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_cntl.h 0000664 0000000 0000000 00000006672 14634250137 0024131 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Control-tree node for the gemv operation. A node carries its own
   selectors (implementation type, variant number, blocksize id) followed
   by pointers to the sub-nodes named by the remaining fields. */
struct gemv_s
{
/* Selectors describing this node. NOTE(review): bli_gemv_int() appears to
   read the first two via bli_cntl_impl_type()/bli_cntl_var_num() to index
   its variant table -- confirm against those accessor definitions. */
impl_t impl_type;
varnum_t var_num;
bszid_t bszid;
/* Sub-nodes; one per auxiliary operation named by the field. */
struct scalv_s* sub_scalv;
struct packm_s* sub_packm_a;
struct packv_s* sub_packv_x;
struct packv_s* sub_packv_y;
struct gemv_s* sub_gemv;
struct unpackv_s* sub_unpackv_y;
};
/* Short alias used throughout the gemv control-tree prototypes. */
typedef struct gemv_s gemv_t;
// Accessor macros that expand to a sub-node field of a control-tree node.
// The argument is parenthesized so that any pointer-valued expression
// (e.g. "p + 1" or "flag ? a : b") binds as a whole before "->" is applied.
// Each expansion is still an lvalue, so assignments through the accessor
// keep working.
//
// NOTE(review): struct gemv_s as declared in this header only has a
// sub_gemv member; the _rp/_cp/_n_*/_t_* accessors below name fields that
// are not part of that struct. They are kept for compatibility -- confirm
// whether they are stale or meant for a different node type.
#define bli_cntl_sub_gemv( cntl )      (cntl)->sub_gemv
#define bli_cntl_sub_gemv_rp( cntl )   (cntl)->sub_gemv_rp
#define bli_cntl_sub_gemv_cp( cntl )   (cntl)->sub_gemv_cp
#define bli_cntl_sub_gemv_n_rp( cntl ) (cntl)->sub_gemv_n_rp
#define bli_cntl_sub_gemv_n_cp( cntl ) (cntl)->sub_gemv_n_cp
#define bli_cntl_sub_gemv_t_rp( cntl ) (cntl)->sub_gemv_t_rp
#define bli_cntl_sub_gemv_t_cp( cntl ) (cntl)->sub_gemv_t_cp
/* Module-level setup/teardown hooks for the gemv control trees.
   NOTE(review): presumably these build and free the gemv_cntl_* trees
   referenced by bli_gemv_front.c -- confirm in bli_gemv_cntl.c. */
void bli_gemv_cntl_init( void );
void bli_gemv_cntl_finalize( void );
/* Returns a gemv control-tree node. The parameters mirror the fields of
   struct gemv_s one-for-one (selectors first, then sub-node pointers).
   NOTE(review): ownership/allocation of the returned node is not visible
   from this header -- confirm before freeing. */
gemv_t* bli_gemv_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalv_t* sub_scalv,
packm_t* sub_packm_a,
packv_t* sub_packv_x,
packv_t* sub_packv_y,
gemv_t* sub_gemv,
unpackv_t* sub_unpackv_y );
/* In-place counterpart of the above: takes an existing node (cntl) plus
   the same selector and sub-node arguments. */
void bli_gemv_cntl_obj_init( gemv_t* cntl,
impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalv_t* sub_scalv,
packm_t* sub_packm_a,
packv_t* sub_packv_x,
packv_t* sub_packv_y,
gemv_t* sub_gemv,
unpackv_t* sub_unpackv_y );
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_front.c 0000664 0000000 0000000 00000016672 14634250137 0024315 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees defined in another translation unit. bli_gemv_front()
// picks one of the "bs_ke" trees when a, x and y all have unit stride and
// one of the "ge" trees otherwise; the axpy/dot suffix is chosen from the
// storage and transposition of a.
extern gemv_t* gemv_cntl_bs_ke_axpy;
extern gemv_t* gemv_cntl_bs_ke_dot;
extern gemv_t* gemv_cntl_ge_axpy;
extern gemv_t* gemv_cntl_ge_dot;
// Object-based front end for gemv. Validates the operands (when error
// checking is enabled), makes local copy-casts of alpha and beta, selects
// a control tree from the strides/storage of a, x and y, and forwards
// everything to bli_gemv_int().
//
//   alpha, beta - scalar objects; only local copy-casts are passed on.
//   a           - matrix operand; its storage and trans status pick the tree.
//   x, y        - vector operands.
//   cntx        - context, forwarded unchanged.
//
// Side effect: when not all operands have unit stride, the pack schema of
// each unit-stride operand (a, x and/or y) is overwritten on the caller's
// object.
void bli_gemv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx
     )
{
	gemv_t* gemv_cntl;
	num_t   dt_targ_a;
	num_t   dt_targ_x;
	num_t   dt_targ_y;
	bool    a_has_unit_inc;
	bool    x_has_unit_inc;
	bool    y_has_unit_inc;
	obj_t   alpha_local;
	obj_t   beta_local;
	num_t   dt_alpha;
	num_t   dt_beta;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_gemv_check( alpha, a, x, beta, y );

	// Query the target datatypes of each object.
	dt_targ_a = bli_obj_target_dt( a );
	dt_targ_x = bli_obj_target_dt( x );
	dt_targ_y = bli_obj_target_dt( y );

	// Determine whether each operand is stored with unit stride.
	a_has_unit_inc = ( bli_obj_is_row_stored( a ) ||
	                   bli_obj_is_col_stored( a ) );
	x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 );
	y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 );

	// Create an object to hold a copy-cast of alpha. Notice that we use
	// the type union of the target datatypes of a and x to prevent any
	// unnecessary loss of information during the computation.
	dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x );
	bli_obj_scalar_init_detached_copy_of( dt_alpha,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_local );

	// Create an object to hold a copy-cast of beta. Notice that we use
	// the datatype of y. Here's why: If y is real and beta is complex,
	// there is no reason to keep beta_local in the complex domain since
	// the complex part of beta*y will not be stored. If y is complex and
	// beta is real then beta is harmlessly promoted to complex.
	dt_beta = dt_targ_y;
	bli_obj_scalar_init_detached_copy_of( dt_beta,
	                                      BLIS_NO_CONJUGATE,
	                                      beta,
	                                      &beta_local );

	// If all operands have unit stride, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( a_has_unit_inc &&
	     x_has_unit_inc &&
	     y_has_unit_inc )
	{
		// A row-major layout with no transpose is typically best served by
		// a dot-based implementation (and the same goes for a column-major
		// layout with a transposition) because it engenders unit stride
		// within matrix A. Similarly, an axpy-based code is better for
		// row-major cases with a transpose and column-major without a
		// transpose. For the general stride case, we mimic that of column-
		// major storage since that is the format into which we copy/pack.
		if ( bli_obj_has_notrans( a ) )
		{
			if ( bli_obj_is_row_stored( a ) ) gemv_cntl = gemv_cntl_bs_ke_dot;
			else                              gemv_cntl = gemv_cntl_bs_ke_axpy;
		}
		else // if ( bli_obj_has_trans( a ) )
		{
			if ( bli_obj_is_row_stored( a ) ) gemv_cntl = gemv_cntl_bs_ke_axpy;
			else                              gemv_cntl = gemv_cntl_bs_ke_dot;
		}
	}
	else
	{
		// Mark objects with unit stride as already being packed. This prevents
		// unnecessary packing from happening within the blocked algorithm.
		if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );
		if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
		if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_has_notrans( a ) )
		{
			if ( bli_obj_is_row_tilted( a ) ) gemv_cntl = gemv_cntl_ge_dot;
			else                              gemv_cntl = gemv_cntl_ge_axpy;
		}
		else // if ( bli_obj_has_trans( a ) )
		{
			if ( bli_obj_is_row_tilted( a ) ) gemv_cntl = gemv_cntl_ge_axpy;
			else                              gemv_cntl = gemv_cntl_ge_dot;
		}
	}

	// Invoke the internal back-end with the copy-casts of scalars and the
	// chosen control tree. The first two arguments are "extra" trans/conj
	// requests layered on top of what a and x already carry; none is wanted
	// here. The second parameter of bli_gemv_int() is a conj_t, so it must
	// receive a conj_t constant rather than a trans_t one.
	bli_gemv_int( BLIS_NO_TRANSPOSE,
	              BLIS_NO_CONJUGATE,
	              &alpha_local,
	              a,
	              x,
	              &beta_local,
	              y,
	              cntx,
	              gemv_cntl );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
/* Typed (BLAS-like) wrapper: wraps the raw buffers in temporary objects
   and calls the object-based front end named by opname. */ \
void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	obj_t alpha_obj, beta_obj; \
	obj_t a_obj, x_obj, y_obj; \
\
	dim_t rows_a, cols_a; \
	dim_t len_x, len_y; \
\
	/* A is described untransposed; the lengths of y and x follow from
	   applying transa to (m, n). */ \
	bli_set_dims_with_trans( BLIS_NO_TRANSPOSE, m, n, &rows_a, &cols_a ); \
	bli_set_dims_with_trans( transa,            m, n, &len_y,  &len_x  ); \
\
	/* Express each vector as a one-column matrix: row stride is the vector
	   increment, column stride spans the whole vector. */ \
	const inc_t x_rs = incx; \
	const inc_t x_cs = len_x * incx; \
	const inc_t y_rs = incy; \
	const inc_t y_cs = len_y * incy; \
\
	/* Scalars first, then the matrix and vector operands. */ \
	bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alpha_obj ); \
	bli_obj_create_1x1_with_attached_buffer( dt, beta,  &beta_obj  ); \
\
	bli_obj_create_with_attached_buffer( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_create_with_attached_buffer( dt, len_x,  1,      x, x_rs, x_cs, &x_obj ); \
	bli_obj_create_with_attached_buffer( dt, len_y,  1,      y, y_rs, y_cs, &y_obj ); \
\
	/* Record the requested trans/conj on the objects themselves. */ \
	bli_obj_set_conjtrans( transa, &a_obj ); \
	bli_obj_set_conj( conjx, &x_obj ); \
\
	PASTEMAC0(opname)( &alpha_obj, &a_obj, &x_obj, &beta_obj, &y_obj, cntx ); \
}
INSERT_GENTFUNC_BASIC0( gemv_front )
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_front.h 0000664 0000000 0000000 00000004272 14634250137 0024313 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Object-based gemv front end: takes the two scalars (alpha, beta), the
   matrix a, the vectors x and y, and a context. Defined in
   bli_gemv_front.c. */
void bli_gemv_front
(
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx
);
/* Prototype the typed (BLAS-like) gemv_front wrappers, one per datatype
   character. The parameter list must stay in lock-step with the GENTFUNC
   definition in bli_gemv_front.c. */
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx \
     );

/* Use the BASIC0 instantiator: GENTPROT above takes (ctype, ch, opname)
   with no extra argument, matching the INSERT_GENTFUNC_BASIC0 used for the
   definitions in bli_gemv_front.c. */
INSERT_GENTPROT_BASIC0( gemv_front )
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_int.c 0000664 0000000 0000000 00000007033 14634250137 0023746 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by every gemv variant reachable through the table
// below; FUNCPTR_T is a file-local alias for the typedef name gemv_fp.
#define FUNCPTR_T gemv_fp
typedef void (*FUNCPTR_T)( obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
gemv_t* cntl );
// Variant dispatch table, indexed as vars[variant number][implementation
// type] by bli_gemv_int(). Rows are variants 1 and 2 followed by an
// all-NULL row; the columns are labelled by the comment inside the table.
// NOTE(review): nothing here guards against selecting a NULL entry.
static FUNCPTR_T vars[3][3] =
{
// unblocked unblocked with fusing blocked
{ bli_gemv_unb_var1, bli_gemv_unf_var1, bli_gemv_blk_var1 },
{ bli_gemv_unb_var2, bli_gemv_unf_var2, bli_gemv_blk_var2 },
{ NULL, NULL, NULL },
};
// Internal gemv back end. Applies the extra transa/conjx requests to
// aliases of a and x, handles the zero-dimension cases, and then calls
// the variant selected by the control-tree node.
//
//   transa, conjx - applied to local aliases of a and x (originals untouched).
//   alpha, beta   - scalar objects, passed through to the variant.
//   a, x, y       - operands; y is scaled by beta if x is empty.
//   cntx          - context, passed through.
//   cntl          - control-tree node supplying variant number and impl type.
void bli_gemv_int( trans_t transa,
                   conj_t  conjx,
                   obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   obj_t*  beta,
                   obj_t*  y,
                   cntx_t* cntx,
                   gemv_t* cntl )
{
	obj_t a_alias;
	obj_t x_alias;

	// Work on aliases so the trans/conj arguments never leak into the
	// caller's objects.
	bli_obj_alias_with_trans( transa, a, &a_alias );
	bli_obj_alias_with_conj( conjx, x, &x_alias );

	// Validate using the aliases, so that transa is reflected in the
	// dimensions being checked.
	if ( bli_error_checking_is_enabled() )
		bli_gemv_check( alpha, &a_alias, &x_alias, beta, y );

	// Nothing to compute or store when y is empty.
	if ( bli_obj_has_zero_dim( y ) ) return;

	// With an empty x the product term vanishes; only beta*y remains.
	if ( bli_obj_has_zero_dim( x ) )
	{
		bli_scalm( beta, y );
		return;
	}

	// Look up the variant by (variant number, implementation type) and
	// run it on the aliased operands.
	const varnum_t variant = bli_cntl_var_num( cntl );
	const impl_t   impl    = bli_cntl_impl_type( cntl );

	vars[variant][impl]( alpha, &a_alias, &x_alias, beta, y, cntx, cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/gemv/other/bli_gemv_int.h 0000664 0000000 0000000 00000003541 14634250137 0023753 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Internal gemv back end. transa and conjx are extra trans/conj requests
   for a and x; cntl is the control-tree node that selects the variant to
   run. Defined in bli_gemv_int.c. */
void bli_gemv_int
(
trans_t transa,
conj_t conjx,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
gemv_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/2/ger/ 0000775 0000000 0000000 00000000000 14634250137 0017637 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/ger/bli_ger.h 0000664 0000000 0000000 00000003463 14634250137 0021421 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_ger_cntl.h"
//#include "bli_ger_front.h"
//#include "bli_ger_int.h"
#include "bli_ger_var.h"
cython-blis-1.0.0/blis/_src/frame/2/ger/bli_ger_unb_var1.c 0000664 0000000 0000000 00000005321 14634250137 0023204 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Unblocked ger, variant 1: walks the m rows of A and updates each row
   with one axpyv call: A(i,:) += ( alpha * conjx( x(i) ) ) * conjy( y ). */ \
void PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* The axpyv kernel for this datatype comes from the context. */ \
	PASTECH(ch,axpyv_ker_ft) axpyv_fp = \
	    bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	for ( dim_t row = 0; row < m; ++row ) \
	{ \
		ctype* a_row   = a + row * rs_a; \
		ctype* chi_ptr = x + row * incx; \
		ctype  scaled_chi; \
\
		/* scaled_chi = alpha * conjx( x(row) ) */ \
		PASTEMAC(ch,copycjs)( conjx, *chi_ptr, scaled_chi ); \
		PASTEMAC(ch,scals)( *alpha, scaled_chi ); \
\
		/* a_row += scaled_chi * conjy( y ); a row is traversed with the
		   column stride of A. */ \
		axpyv_fp \
		( \
		  conjy, \
		  n, \
		  &scaled_chi, \
		  y,     incy, \
		  a_row, cs_a, \
		  cntx \
		); \
	} \
}
INSERT_GENTFUNC_BASIC0( ger_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/ger/bli_ger_unb_var2.c 0000664 0000000 0000000 00000005314 14634250137 0023207 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Unblocked ger, variant 2: walks the n columns of A and updates each
   column with one axpyv call: A(:,j) += ( alpha * conjy( y(j) ) ) * conjx( x ). */ \
void PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* The axpyv kernel for this datatype comes from the context. */ \
	PASTECH(ch,axpyv_ker_ft) axpyv_fp = \
	    bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	for ( dim_t col = 0; col < n; ++col ) \
	{ \
		ctype* a_col   = a + col * cs_a; \
		ctype* psi_ptr = y + col * incy; \
		ctype  scaled_psi; \
\
		/* scaled_psi = alpha * conjy( y(col) ) */ \
		PASTEMAC(ch,copycjs)( conjy, *psi_ptr, scaled_psi ); \
		PASTEMAC(ch,scals)( *alpha, scaled_psi ); \
\
		/* a_col += scaled_psi * conjx( x ); a column is traversed with the
		   row stride of A. */ \
		axpyv_fp \
		( \
		  conjx, \
		  m, \
		  &scaled_psi, \
		  x,     incx, \
		  a_col, rs_a, \
		  cntx \
		); \
	} \
}
INSERT_GENTFUNC_BASIC0( ger_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/ger/bli_ger_var.h 0000664 0000000 0000000 00000004737 14634250137 0022276 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Each GENPROT( name ) line below declares one object-based ger variant
// taking alpha, x, y, a, a context and a control-tree node.
// NOTE(review): the blocked variants under ger/other/ are defined with a
// ger_t* final parameter rather than cntl_t* -- confirm which is intended
// before re-enabling that code.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* x, \
obj_t* y, \
obj_t* a, \
cntx_t* cntx, \
cntl_t* cntl \
);
GENPROT( ger_blk_var1 )
GENPROT( ger_blk_var2 )
GENPROT( ger_unb_var1 )
GENPROT( ger_unb_var2 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// Each instantiation below declares the typed unblocked ger variants, one
// per datatype character. The parameter list matches the GENTFUNC
// definitions in bli_ger_unb_var1.c and bli_ger_unb_var2.c.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* a, inc_t rs_a, inc_t cs_a, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( ger_unb_var1 )
INSERT_GENTPROT_BASIC0( ger_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/ger/bli_ger_var_oapi.c 0000664 0000000 0000000 00000005722 14634250137 0023274 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENFRONT
#define GENFRONT( opname, varname ) \
\
/* Object-based entry point for one unblocked ger variant: unpacks the
   datatype, conj flags, dimensions, buffers and strides from the objects
   and calls the matching typed implementation. cntl is accepted for
   signature compatibility but is not used here. */ \
void PASTEMAC0(varname) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  a, \
       cntx_t* cntx, \
       cntl_t* cntl \
     ) \
{ \
	bli_init_once(); \
\
	/* The datatype of a selects the typed implementation and the buffer
	   used for alpha. */ \
	num_t     dt        = bli_obj_dt( a ); \
\
	conj_t    conjx     = bli_obj_conj_status( x ); \
	conj_t    conjy     = bli_obj_conj_status( y ); \
\
	dim_t     m         = bli_obj_length( a ); \
	dim_t     n         = bli_obj_width( a ); \
\
	void*     buf_x     = bli_obj_buffer_at_off( x ); \
	inc_t     incx      = bli_obj_vector_inc( x ); \
\
	void*     buf_y     = bli_obj_buffer_at_off( y ); \
	inc_t     incy      = bli_obj_vector_inc( y ); \
\
	void*     buf_a     = bli_obj_buffer_at_off( a ); \
	inc_t     rs_a      = bli_obj_row_stride( a ); \
	inc_t     cs_a      = bli_obj_col_stride( a ); \
\
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
	/* Query a type-specific function pointer, except one that uses */ \
	/* void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  conjx, \
	  conjy, \
	  m, \
	  n, \
	  buf_alpha, \
	  buf_x, incx, \
	  buf_y, incy, \
	  buf_a, rs_a, cs_a, \
	  cntx \
	); \
}

/* The macro body ends at the closing brace above with no line continuation,
   so the instantiations below are never spliced into the definition. */
GENFRONT( ger, ger_unb_var1 )
GENFRONT( ger, ger_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/ger/other/ 0000775 0000000 0000000 00000000000 14634250137 0020760 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_blk_var1.c 0000664 0000000 0000000 00000007345 14634250137 0024321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked ger, variant 1: partitions A and x along the m dimension and,
// for each block, packs the pieces as directed by the control tree, runs
// the sub-ger on them, and unpacks the A block.
//
//   alpha - scalar object, passed through to bli_ger_int().
//   x, y  - vector operands; x is partitioned, y is used whole.
//   a     - matrix operand, partitioned top to bottom.
//   cntx  - context, passed through.
//   cntl  - control-tree node supplying blocksize id and sub-nodes.
void bli_ger_blk_var1( obj_t*  alpha,
                       obj_t*  x,
                       obj_t*  y,
                       obj_t*  a,
                       cntx_t* cntx,
                       ger_t*  cntl )
{
	obj_t a_blk, a_blk_packed;
	obj_t x_blk, x_blk_packed;

	dim_t offset;
	dim_t blk_len;
	dim_t m_total;

	// Put the pack objects into their initial state before first use.
	bli_obj_init_pack( &a_blk_packed );
	bli_obj_init_pack( &x_blk_packed );

	// Extent of the dimension being partitioned.
	m_total = bli_obj_length_after_trans( a );

	// March down the m dimension one block at a time.
	offset = 0;
	while ( offset < m_total )
	{
		// Blocksize to use for this iteration.
		blk_len = bli_determine_blocksize_f( offset, m_total, a,
		                                     bli_cntl_bszid( cntl ), cntx );

		// Carve out the current row panel of A and the matching piece of x.
		bli_acquire_mpart_t2b( BLIS_SUBPART1,
		                       offset, blk_len, a, &a_blk );
		bli_acquire_vpart_f2b( BLIS_SUBPART1,
		                       offset, blk_len, x, &x_blk );

		// Prepare the pack objects for this block (if packing is needed).
		bli_packm_init( &a_blk, &a_blk_packed,
		                cntx, bli_cntl_sub_packm_a( cntl ) );
		bli_packv_init( &x_blk, &x_blk_packed,
		                cntx, bli_cntl_sub_packv_x( cntl ) );

		// Perform the copy/pack of the A block and x block (if needed).
		bli_packm_int( &a_blk, &a_blk_packed,
		               cntx, bli_cntl_sub_packm_a( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_blk, &x_blk_packed,
		               cntx, bli_cntl_sub_packv_x( cntl ) );

		// A_blk = A_blk + alpha * x_blk * y;
		bli_ger_int( BLIS_NO_CONJUGATE,
		             BLIS_NO_CONJUGATE,
		             alpha,
		             &x_blk_packed,
		             y,
		             &a_blk_packed,
		             cntx,
		             bli_cntl_sub_ger( cntl ) );

		// Copy the result back into the A block (if it was packed).
		bli_unpackm_int( &a_blk_packed, &a_blk,
		                 cntx, bli_cntl_sub_unpackm_a( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		offset += blk_len;
	}

	// Hand any packing buffers acquired above back to the memory manager.
	bli_packm_release( &a_blk_packed, bli_cntl_sub_packm_a( cntl ) );
	bli_packv_release( &x_blk_packed, bli_cntl_sub_packv_x( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_blk_var2.c 0000664 0000000 0000000 00000007344 14634250137 0024321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked ger, variant 2: sweeps the n dimension of A left to right in
// blocks, pairing each block A1 with the matching subvector y1, and applies
// the rank-1 update A1 = A1 + alpha * x * y1 through the sub-control-tree
// stored in cntl. x is passed through whole.
void bli_ger_blk_var2( obj_t*  alpha,
                       obj_t*  x,
                       obj_t*  y,
                       obj_t*  a,
                       cntx_t* cntx,
                       ger_t*  cntl )
{
    obj_t a1, a1_pack;
    obj_t y1, y1_pack;

    dim_t offset;   // start of the current block along n
    dim_t blksz;    // algorithmic blocksize of the current block
    dim_t n_total;  // width of A after any transposition

    // Put the pack objects into a known initial state.
    bli_obj_init_pack( &a1_pack );
    bli_obj_init_pack( &y1_pack );

    // The partitioning direction is the n dimension.
    n_total = bli_obj_width_after_trans( a );

    offset = 0;
    while ( offset < n_total )
    {
        // Blocksize for this iteration.
        blksz = bli_determine_blocksize_f( offset, n_total, a,
                                           bli_cntl_bszid( cntl ), cntx );

        // Carve out A1 and y1.
        bli_acquire_mpart_l2r( BLIS_SUBPART1, offset, blksz, a, &a1 );
        bli_acquire_vpart_f2b( BLIS_SUBPART1, offset, blksz, y, &y1 );

        // Set up the pack objects for A1 and y1 (if needed).
        bli_packm_init( &a1, &a1_pack,
                        cntx, bli_cntl_sub_packm_a( cntl ) );
        bli_packv_init( &y1, &y1_pack,
                        cntx, bli_cntl_sub_packv_y( cntl ) );

        // Perform the copy/pack of A1 and y1 (if needed).
        bli_packm_int( &a1, &a1_pack,
                       cntx, bli_cntl_sub_packm_a( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &y1, &y1_pack,
                       cntx, bli_cntl_sub_packv_y( cntl ) );

        // A1 = A1 + alpha * x * y1;
        bli_ger_int( BLIS_NO_CONJUGATE,
                     BLIS_NO_CONJUGATE,
                     alpha,
                     x,
                     &y1_pack,
                     &a1_pack,
                     cntx,
                     bli_cntl_sub_ger( cntl ) );

        // Write the packed A1 back (if A1 was packed).
        bli_unpackm_int( &a1_pack, &a1,
                         cntx, bli_cntl_sub_unpackm_a( cntl ),
                         &BLIS_PACKM_SINGLE_THREADED );

        offset += blksz;
    }

    // Hand any packing buffers acquired along the way back to the memory
    // manager.
    bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) );
    bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_cntl.c 0000664 0000000 0000000 00000017104 14634250137 0023552 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Pack/unpack control nodes defined in other translation units; they are
// wired into the blocked ger control trees built by bli_ger_cntl_init().
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackm_t* unpackm_cntl;
// Global ger control tree nodes. All start as NULL and are populated by
// bli_ger_cntl_init() and freed by bli_ger_cntl_finalize().
// - bs_ke: leaf nodes that call the unblocked variants directly.
// - rp_bs: blocked nodes for row-panel problems (small m dimension).
// - cp_bs: blocked nodes for column-panel problems (small n dimension).
// - ge:    blocked nodes for generally large problems.
// The _row/_col suffix selects the tree used for row- versus column-oriented
// storage of A (see the selection logic in bli_ger_front()).
ger_t* ger_cntl_bs_ke_row = NULL;
ger_t* ger_cntl_bs_ke_col = NULL;
ger_t* ger_cntl_rp_bs_row = NULL;
ger_t* ger_cntl_rp_bs_col = NULL;
ger_t* ger_cntl_cp_bs_row = NULL;
ger_t* ger_cntl_cp_bs_col = NULL;
ger_t* ger_cntl_ge_row = NULL;
ger_t* ger_cntl_ge_col = NULL;
// Builds the global ger control trees. Leaf nodes are created first because
// the blocked nodes created afterwards store pointers to them (and the
// general-case nodes store pointers to the column-panel nodes).
//
// The parameter list is spelled ( void ) so that the definition is a real
// prototype and matches the declaration in bli_ger_cntl.h.
void bli_ger_cntl_init( void )
{
    // Create control trees for the lowest-level kernels. These trees induce
    // operations on (presumably) relatively small block-subvector problems.
    ger_cntl_bs_ke_row
    =
    bli_ger_cntl_obj_create( BLIS_UNBLOCKED,
                             BLIS_VARIANT1,
                             0,
                             NULL, NULL, NULL,
                             NULL, NULL );
    ger_cntl_bs_ke_col
    =
    bli_ger_cntl_obj_create( BLIS_UNBLOCKED,
                             BLIS_VARIANT2,
                             0,
                             NULL, NULL, NULL,
                             NULL, NULL );

    // Create control trees for problems with relatively small m dimension
    // (ie: where A is a row panel problem).
    ger_cntl_rp_bs_row
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT2,
                             BLIS_N2,
                             NULL,               // x is not partitioned in var2
                             packv_cntl,         // pack y1 (if needed)
                             packm_cntl,         // pack A1 (if needed)
                             ger_cntl_bs_ke_row,
                             unpackm_cntl );     // unpack A1 (if packed)
    ger_cntl_rp_bs_col
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT2,
                             BLIS_N2,
                             NULL,               // x is not partitioned in var2
                             packv_cntl,         // pack y1 (if needed)
                             packm_cntl,         // pack A1 (if needed)
                             ger_cntl_bs_ke_col,
                             unpackm_cntl );     // unpack A1 (if packed)

    // Create control trees for problems with relatively small n dimension
    // (ie: where A is a column panel problem).
    ger_cntl_cp_bs_row
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT1,
                             BLIS_M2,
                             packv_cntl,         // pack x1 (if needed)
                             NULL,               // y is not partitioned in var1
                             packm_cntl,         // pack A1 (if needed)
                             ger_cntl_bs_ke_row,
                             unpackm_cntl );     // unpack A1 (if packed)
    ger_cntl_cp_bs_col
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT1,
                             BLIS_M2,
                             packv_cntl,         // pack x1 (if needed)
                             NULL,               // y is not partitioned in var1
                             packm_cntl,         // pack A1 (if needed)
                             ger_cntl_bs_ke_col,
                             unpackm_cntl );     // unpack A1 (if packed)

    // Create control trees for generally large problems. Here, we choose a
    // variant that partitions subproblems into column panels.
    ger_cntl_ge_row
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT2,
                             BLIS_N2,
                             NULL,               // x is not partitioned in var2
                             packv_cntl,         // pack y1 (if needed)
                             NULL,               // do not pack A1
                             ger_cntl_cp_bs_row,
                             NULL );             // do not unpack A1
    ger_cntl_ge_col
    =
    bli_ger_cntl_obj_create( BLIS_BLOCKED,
                             BLIS_VARIANT2,
                             BLIS_N2,
                             NULL,               // x is not partitioned in var2
                             packv_cntl,         // pack y1 (if needed)
                             NULL,               // do not pack A1
                             ger_cntl_cp_bs_col,
                             NULL );             // do not unpack A1
}
void bli_ger_cntl_finalize()
{
bli_cntl_free_node( ger_cntl_bs_ke_row );
bli_cntl_free_node( ger_cntl_bs_ke_col );
bli_cntl_free_node( ger_cntl_rp_bs_row );
bli_cntl_free_node( ger_cntl_rp_bs_col );
bli_cntl_free_node( ger_cntl_cp_bs_row );
bli_cntl_free_node( ger_cntl_cp_bs_col );
bli_cntl_free_node( ger_cntl_ge_row );
bli_cntl_free_node( ger_cntl_ge_col );
}
// Allocates a ger control tree node with bli_malloc_intl(), stores the given
// implementation type, variant number, blocksize id and sub-node pointers in
// it, and returns the new node. The sub-node pointers are stored as given;
// nothing is copied.
ger_t* bli_ger_cntl_obj_create( impl_t     impl_type,
                                varnum_t   var_num,
                                bszid_t    bszid,
                                packv_t*   sub_packv_x,
                                packv_t*   sub_packv_y,
                                packm_t*   sub_packm_a,
                                ger_t*     sub_ger,
                                unpackm_t* sub_unpackm_a )
{
    ger_t* node = ( ger_t* ) bli_malloc_intl( sizeof( ger_t ) );

    // Scalar attributes of the node.
    node->impl_type     = impl_type;
    node->var_num       = var_num;
    node->bszid         = bszid;

    // Links to the sub-nodes.
    node->sub_packv_x   = sub_packv_x;
    node->sub_packv_y   = sub_packv_y;
    node->sub_packm_a   = sub_packm_a;
    node->sub_ger       = sub_ger;
    node->sub_unpackm_a = sub_unpackm_a;

    return node;
}
// Fills an existing, caller-owned ger control tree node with the given
// implementation type, variant number, blocksize id and sub-node pointers.
// Every field of the node is overwritten.
void bli_ger_cntl_obj_init( ger_t*     cntl,
                            impl_t     impl_type,
                            varnum_t   var_num,
                            bszid_t    bszid,
                            packv_t*   sub_packv_x,
                            packv_t*   sub_packv_y,
                            packm_t*   sub_packm_a,
                            ger_t*     sub_ger,
                            unpackm_t* sub_unpackm_a )
{
    // Assign all eight fields in one shot via a compound literal.
    *cntl = ( ger_t )
    {
        .impl_type     = impl_type,
        .var_num       = var_num,
        .bszid         = bszid,
        .sub_packv_x   = sub_packv_x,
        .sub_packv_y   = sub_packv_y,
        .sub_packm_a   = sub_packm_a,
        .sub_ger       = sub_ger,
        .sub_unpackm_a = sub_unpackm_a,
    };
}
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_cntl.h 0000664 0000000 0000000 00000006025 14634250137 0023557 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control tree node for the ger (rank-1 update) operation. A node records
// which implementation/variant to run and points at the sub-nodes used for
// packing the operands, for the nested ger call, and for unpacking A.
struct ger_s
{
// Implementation type (e.g. BLIS_UNBLOCKED or BLIS_BLOCKED).
impl_t impl_type;
// Variant number (e.g. BLIS_VARIANT1 or BLIS_VARIANT2).
varnum_t var_num;
// Blocksize id used by the blocked variants (e.g. BLIS_M2, BLIS_N2).
bszid_t bszid;
// Sub-node for packing x; NULL when x is not partitioned/packed.
struct packv_s* sub_packv_x;
// Sub-node for packing y; NULL when y is not partitioned/packed.
struct packv_s* sub_packv_y;
// Sub-node for packing A; NULL when A is not packed.
struct packm_s* sub_packm_a;
// Sub-node for the nested ger call; NULL for leaf nodes.
struct ger_s* sub_ger;
// Sub-node for unpacking A; NULL when A is not unpacked.
struct unpackm_s* sub_unpackm_a;
};
typedef struct ger_s ger_t;
// Accessors for the nested-ger links of a control tree node. The argument
// is parenthesized so that any pointer expression (e.g. `p + 1`, `*pp`)
// expands correctly, and the whole expansion is parenthesized so it can be
// used inside larger expressions.
#define bli_cntl_sub_ger( cntl )    ( (cntl)->sub_ger )
// NOTE(review): struct ger_s as declared in this header has no sub_ger_rp or
// sub_ger_cp member, so these two accessors cannot be used with a ger_t
// node. They are kept only for source compatibility -- confirm whether they
// can be removed.
#define bli_cntl_sub_ger_rp( cntl ) ( (cntl)->sub_ger_rp )
#define bli_cntl_sub_ger_cp( cntl ) ( (cntl)->sub_ger_cp )
// Creates the global ger control tree nodes.
void bli_ger_cntl_init( void );
// Frees the global ger control tree nodes.
void bli_ger_cntl_finalize( void );
// Allocates a new ger control tree node, stores the given attributes and
// sub-node pointers in it, and returns it.
ger_t* bli_ger_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x,
packv_t* sub_packv_y,
packm_t* sub_packm_a,
ger_t* sub_ger,
unpackm_t* sub_unpackm_a );
// Stores the given attributes and sub-node pointers in an existing node
// supplied by the caller (no allocation is performed).
void bli_ger_cntl_obj_init( ger_t* cntl,
impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x,
packv_t* sub_packv_y,
packm_t* sub_packm_a,
ger_t* sub_ger,
unpackm_t* sub_unpackm_a );
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_front.c 0000664 0000000 0000000 00000013226 14634250137 0023743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Global ger control trees (defined in bli_ger_cntl.c) that bli_ger_front()
// chooses between: the bs_ke trees call the unblocked variants directly,
// the ge trees perform blocking.
extern ger_t* ger_cntl_bs_ke_row;
extern ger_t* ger_cntl_bs_ke_col;
extern ger_t* ger_cntl_ge_row;
extern ger_t* ger_cntl_ge_col;
// Object-based front-end for ger. Makes a copy-cast of alpha, selects a
// control tree according to the strides/storage of x, y and A, and calls
// bli_ger_int() with that tree.
//
// When blocking is required, any of x, y, a that already has unit stride is
// marked as packed on the caller's object so the blocked algorithm does not
// pack it again.
void bli_ger_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y,
       obj_t*  a,
       cntx_t* cntx
     )
{
    ger_t* tree;

    num_t  dt_x;
    num_t  dt_y;
    num_t  dt_scalar;

    bool   x_unit;
    bool   y_unit;
    bool   a_unit;

    obj_t  alpha_cast;

    // Validate the operands when error checking is on.
    if ( bli_error_checking_is_enabled() )
        bli_ger_check( alpha, x, y, a );

    // Target datatypes of the two vectors.
    dt_x = bli_obj_target_dt( x );
    dt_y = bli_obj_target_dt( y );

    // Which operands have unit stride? For A that means row or column
    // storage.
    x_unit = ( bli_obj_vector_inc( x ) == 1 );
    y_unit = ( bli_obj_vector_inc( y ) == 1 );
    a_unit = ( bli_obj_is_row_stored( a ) ||
               bli_obj_is_col_stored( a ) );

    // Copy-cast alpha into a local object whose datatype is the type union
    // of the x and y target datatypes, so no information is lost needlessly
    // during the computation.
    dt_scalar = bli_dt_union( dt_x, dt_y );
    bli_obj_scalar_init_detached_copy_of( dt_scalar,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    if ( !x_unit || !y_unit || !a_unit )
    {
        // At least one operand lacks unit stride, so a blocking tree is
        // needed. Flag the operands that do have unit stride as already
        // packed to avoid redundant packing inside the blocked algorithm.
        if ( x_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
        if ( y_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );
        if ( a_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );

        // Pick the blocking tree by the storage tilt of A.
        tree = bli_obj_is_row_tilted( a ) ? ger_cntl_ge_row
                                          : ger_cntl_ge_col;
    }
    else
    {
        // Everything has unit stride: go straight to the unblocked
        // implementation, choosing the tree by the storage of A.
        tree = bli_obj_is_row_stored( a ) ? ger_cntl_bs_ke_row
                                          : ger_cntl_bs_ke_col;
    }

    // Run the internal back-end with the copy-cast scalar and chosen tree.
    bli_ger_int( BLIS_NO_CONJUGATE,
                 BLIS_NO_CONJUGATE,
                 &alpha_cast,
                 x,
                 y,
                 a,
                 cntx,
                 tree );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
/* Template for the typed (BLAS-like) ger_front interface. For each datatype
   character ch, the generated function wraps the raw alpha, x, y and a
   buffers in temporary obj_t objects, applies conjx/conjy to the x and y
   objects, and forwards to the object-based function named by opname. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* a, inc_t rs_a, inc_t cs_a, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao, xo, yo, ao; \
\
dim_t m_x; \
dim_t m_y; \
inc_t rs_x, cs_x; \
inc_t rs_y, cs_y; \
\
/* Derive the vector lengths m_x and m_y from the matrix dimensions. */ \
bli_set_dims_with_trans( BLIS_NO_TRANSPOSE, m, n, &m_x, &m_y ); \
\
/* Express each vector as a one-column matrix: the row stride is the
   vector increment, the column stride is length * increment. */ \
rs_x = incx; cs_x = m_x * incx; \
rs_y = incy; cs_y = m_y * incy; \
\
/* Attach the caller's buffers to local objects (no data is copied here). */ \
bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
\
bli_obj_create_with_attached_buffer( dt, m_x, 1, x, rs_x, cs_x, &xo ); \
bli_obj_create_with_attached_buffer( dt, m_y, 1, y, rs_y, cs_y, &yo ); \
bli_obj_create_with_attached_buffer( dt, m, n, a, rs_a, cs_a, &ao ); \
\
/* Record the requested conjugation on the vector objects. */ \
bli_obj_set_conj( conjx, &xo ); \
bli_obj_set_conj( conjy, &yo ); \
\
/* Call the object-based implementation. */ \
PASTEMAC0(opname)( &alphao, \
&xo, \
&yo, \
&ao, \
cntx ); \
}
INSERT_GENTFUNC_BASIC0( ger_front )
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_front.h 0000664 0000000 0000000 00000004213 14634250137 0023744 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based ger front-end: takes the scalar alpha, vectors x and y, the
// matrix a, and a context.
void bli_ger_front
(
obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* a,
cntx_t* cntx
);
// Prototype template for the typed ger_front interfaces, which take raw
// buffers with explicit dimensions, increments and strides. One prototype is
// generated per datatype by INSERT_GENTPROT_BASIC below.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* a, inc_t rs_a, inc_t cs_a, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( ger_front )
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_int.c 0000664 0000000 0000000 00000010516 14634250137 0023404 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by all ger variants dispatched from
// bli_ger_int().
#define FUNCPTR_T ger_fp
typedef void (*FUNCPTR_T)( obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* a,
cntx_t* cntx,
ger_t* cntl );
// Dispatch table, indexed as vars[variant number][implementation type] (see
// the lookup in bli_ger_int()). NULL entries mark combinations that have no
// implementation; bli_ger_int() does not check for NULL before calling, so
// control trees must only name populated entries.
static FUNCPTR_T vars[4][3] =
{
// unblocked unblocked with fusing blocked
{ bli_ger_unb_var1, NULL, bli_ger_blk_var1, },
{ bli_ger_unb_var2, NULL, bli_ger_blk_var2, },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
};
// Internal ger back-end. Aliases the operands locally (so the caller's
// objects are not modified here), folds conjx/conjy and any conjugation
// marked on A into the local x, y and alpha, induces a transposition of A
// when a leaf implementation is about to be called, and finally dispatches
// to the variant selected by the control tree node.
//
// Parameters:
//   conjx, conjy  conjugation to apply to x and y, respectively.
//   alpha         scalar object.
//   x, y          vector objects.
//   a             matrix object that is updated.
//   cntx          context, passed through to the variant.
//   cntl          control tree node giving the variant number and
//                 implementation type used to index the vars table.
//
// Returns early, without computing anything, when a, x or y has a zero
// dimension.
void bli_ger_int( conj_t  conjx,
                  conj_t  conjy,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  y,
                  obj_t*  a,
                  cntx_t* cntx,
                  ger_t*  cntl )
{
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;
    obj_t     alpha_local;
    obj_t     x_local;
    obj_t     y_local;
    obj_t     a_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_ger_check( alpha, x, y, a );

    // If A has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( a ) ) return;

    // If x or y has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( x ) ||
         bli_obj_has_zero_dim( y ) ) return;

    // Alias the objects, applying conjx and conjy to x and y, respectively.
    bli_obj_alias_with_conj( conjx, x, &x_local );
    bli_obj_alias_with_conj( conjy, y, &y_local );
    bli_obj_alias_to( a, &a_local );

    // If matrix A is marked for conjugation, we interpret this as a request
    // to apply a conjugation to the other operands.
    if ( bli_obj_has_conj( &a_local ) )
    {
        bli_obj_toggle_conj( &a_local );
        bli_obj_toggle_conj( &x_local );
        bli_obj_toggle_conj( &y_local );

        bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ),
                                              BLIS_CONJUGATE,
                                              alpha,
                                              &alpha_local );
    }
    else
    {
        // Alias alpha into the local object. Both arguments are passed as
        // pointers, exactly like the aliasing of A above; passing the
        // objects by value does not match that calling convention.
        bli_obj_alias_to( alpha, &alpha_local );
    }

    // If we are about to call a leaf-level implementation, and matrix A
    // still needs a transposition, then we must induce one by swapping the
    // strides and dimensions.
    if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) )
    {
        bli_obj_induce_trans( &a_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
    }

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    f = vars[n][i];

    // Invoke the variant.
    f( &alpha_local,
       &x_local,
       &y_local,
       &a_local,
       cntx,
       cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/ger/other/bli_ger_int.h 0000664 0000000 0000000 00000003606 14634250137 0023413 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal ger back-end: applies conjx/conjy to x and y and runs the ger
// variant selected by the control tree node cntl.
void bli_ger_int( conj_t conjx,
conj_t conjy,
obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* a,
cntx_t* cntx,
ger_t* cntl );
cython-blis-1.0.0/blis/_src/frame/2/hemv/ 0000775 0000000 0000000 00000000000 14634250137 0020021 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv.h 0000664 0000000 0000000 00000003470 14634250137 0021763 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_hemv_cntl.h"
//#include "bli_hemv_front.h"
//#include "bli_hemv_int.h"
#include "bli_hemv_var.h"
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unb_var1.c 0000664 0000000 0000000 00000011411 14634250137 0023545 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Template for the typed hemv_unb_var1 functions (one per datatype).
   The generated function first overwrites y with beta * y (or with zero when
   beta is zero). It then loops over i = 0..m-1 and, using only the row
   a10t (the i elements of row i before the diagonal, in the possibly
   stride-swapped view of A) and the diagonal element alpha11, performs
     y0   += alpha * a10t' * chi1   (axpyv kernel)
     psi1 += alpha * a10t  * x0     (dotxv kernel)
     psi1 += alpha * alpha11 * chi1
   conjh selects Hermitian behavior: when it is a conjugation, the imaginary
   component of the diagonal element is treated as zero. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
ctype* a10t; \
ctype* alpha11; \
ctype* x0; \
ctype* chi1; \
ctype* y0; \
ctype* psi1; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_behind; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,axpyv_ker_ft) kfp_av; \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointers. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
n_behind = i; \
a10t = a + (i )*rs_at + (0 )*cs_at; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
x0 = x + (0 )*incx; \
chi1 = x + (i )*incx; \
y0 = y + (0 )*incy; \
psi1 = y + (i )*incy; \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* y0 = y0 + alpha * a10t' * chi1; */ \
kfp_av \
( \
conj0, \
n_behind, \
&alpha_chi1, \
a10t, cs_at, \
y0, incy, \
cntx \
); \
\
/* psi1 = psi1 + alpha * a10t * x0; */ \
kfp_dv \
( \
conj1, \
conjx, \
n_behind, \
alpha, \
a10t, cs_at, \
x0, incx, \
one, \
psi1, \
cntx \
); \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unb_var2.c 0000664 0000000 0000000 00000011473 14634250137 0023556 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unb_var2 -- unblocked variant built on the dotxv kernel.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero) and then, for each i in [0,m), accumulates
  three contributions into psi1 = y[i]:
    - alpha * a10t * x0       (dotxv over the n_behind = i leading elements),
    - alpha * a21' * x2       (dotxv over the n_ahead = m-i-1 trailing elements),
    - alpha * alpha11 * chi1  (the diagonal element).
  A is only read through a10t, alpha11 and a21, i.e. row i up to the
  diagonal and column i below it, expressed in the (possibly swapped)
  strides rs_at/cs_at.

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed (loop trip count).
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the dotxv kernel and forwarded to
                  every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
ctype* a10t; \
ctype* alpha11; \
ctype* a21; \
ctype* x0; \
ctype* chi1; \
ctype* x2; \
ctype* psi1; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_behind; \
dim_t n_ahead; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0 is the conj passed with a10t and conj1 the one passed with a21
in the dotxv calls below. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointer. */ \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
/* Locate row i left of the diagonal (a10t), the diagonal element
(alpha11), column i below the diagonal (a21) and the matching pieces
of x and y. */ \
n_behind = i; \
n_ahead = m - i - 1; \
a10t = a + (i )*rs_at + (0 )*cs_at; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
a21 = a + (i+1)*rs_at + (i )*cs_at; \
x0 = x + (0 )*incx; \
chi1 = x + (i )*incx; \
x2 = x + (i+1)*incx; \
psi1 = y + (i )*incy; \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* psi1 = psi1 + alpha * a10t * x0; */ \
kfp_dv \
( \
conj0, \
conjx, \
n_behind, \
alpha, \
a10t, cs_at, \
x0, incx, \
one, \
psi1, \
cntx \
); \
\
/* psi1 = psi1 + alpha * a21' * x2; */ \
kfp_dv \
( \
conj1, \
conjx, \
n_ahead, \
alpha, \
a21, rs_at, \
x2, incx, \
one, \
psi1, \
cntx \
); \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unb_var3.c 0000664 0000000 0000000 00000011405 14634250137 0023552 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unb_var3 -- unblocked variant that pairs one dotxv with one axpyv
  per iteration.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero) and then, for each i in [0,m):
    - adds alpha * alpha11 * chi1 to psi1 = y[i]   (diagonal element),
    - adds alpha * a21' * x2 to psi1               (dotxv, n_ahead elements),
    - adds alpha * a21 * chi1 to y2 = y[i+1..m-1]  (axpyv, n_ahead elements).
  A is only read through alpha11 and a21, i.e. the diagonal and column i
  below it, expressed in the (possibly swapped) strides rs_at/cs_at.

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed (loop trip count).
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the axpyv and dotxv kernels and
                  forwarded to every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
ctype* alpha11; \
ctype* a21; \
ctype* chi1; \
ctype* x2; \
ctype* psi1; \
ctype* y2; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_ahead; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0 is the conj passed with a21 to dotxv and conj1 the one passed
with a21 to axpyv below. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,axpyv_ker_ft) kfp_av; \
PASTECH(ch,dotxv_ker_ft) kfp_dv; \
\
/* Query the context for the kernel function pointers. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
/* Locate the diagonal element (alpha11), column i below the diagonal
(a21) and the matching pieces of x and y. */ \
n_ahead = m - i - 1; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
a21 = a + (i+1)*rs_at + (i )*cs_at; \
chi1 = x + (i )*incx; \
x2 = x + (i+1)*incx; \
psi1 = y + (i )*incy; \
y2 = y + (i+1)*incy; \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
/* psi1 = psi1 + alpha * a21' * x2; */ \
kfp_dv \
( \
conj0, \
conjx, \
n_ahead, \
alpha, \
a21, rs_at, \
x2, incx, \
one, \
psi1, \
cntx \
); \
\
/* y2 = y2 + alpha * a21 * chi1; */ \
kfp_av \
( \
conj1, \
n_ahead, \
&alpha_chi1, \
a21, rs_at, \
y2, incy, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unb_var3 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unb_var4.c 0000664 0000000 0000000 00000011302 14634250137 0023547 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unb_var4 -- unblocked variant built entirely on the axpyv kernel.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero) and then, for each i in [0,m), scatters the
  contribution of chi1 = x[i] into y:
    - y0   = y[0..i-1]   += alpha * a10t' * chi1  (axpyv, n_behind elements),
    - psi1 = y[i]        += alpha * alpha11 * chi1 (diagonal element),
    - y2   = y[i+1..m-1] += alpha * a21 * chi1    (axpyv, n_ahead elements).
  A is only read through a10t, alpha11 and a21, i.e. row i up to the
  diagonal and column i below it, expressed in the (possibly swapped)
  strides rs_at/cs_at.

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed (loop trip count).
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the axpyv kernel and forwarded to
                  every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
ctype* a10t; \
ctype* alpha11; \
ctype* a21; \
ctype* chi1; \
ctype* y0; \
ctype* psi1; \
ctype* y2; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_behind; \
dim_t n_ahead; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0 is the conj passed with a10t and conj1 the one passed with a21
in the axpyv calls below. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
/* Query the context for the kernel function pointer. */ \
kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
/* Locate row i left of the diagonal (a10t), the diagonal element
(alpha11), column i below the diagonal (a21) and the matching pieces
of x and y. */ \
n_behind = i; \
n_ahead = m - i - 1; \
a10t = a + (i )*rs_at + (0 )*cs_at; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
a21 = a + (i+1)*rs_at + (i )*cs_at; \
chi1 = x + (i )*incx; \
y0 = y + (0 )*incy; \
psi1 = y + (i )*incy; \
y2 = y + (i+1)*incy; \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* y0 = y0 + alpha * a10t' * chi1; */ \
kfp_av \
( \
conj0, \
n_behind, \
&alpha_chi1, \
a10t, cs_at, \
y0, incy, \
cntx \
); \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
/* y2 = y2 + alpha * a21 * chi1; */ \
kfp_av \
( \
conj1, \
n_ahead, \
&alpha_chi1, \
a21, rs_at, \
y2, incy, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unb_var4 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unf_var1.c 0000664 0000000 0000000 00000014003 14634250137 0023551 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unf_var1 -- fused variant built on the dotxaxpyf kernel.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero). It then walks the diagonal of A in steps of
  f, where f comes from bli_determine_blocksize_dim_f( i, m, b_fuse ) and
  b_fuse is the BLIS_XF block size read from the context. Each step:
    - hands the panel A10 (rows i.., columns 0..i-1) to one dotxaxpyf call,
      which per the inline comments updates y1 (dotxf part) and y0 (axpyf
      part);
    - processes the f-by-f diagonal block A11 with scalar loops laid out
      like variant 4 (a10t', alpha11 and a21 contributions of each chi11).

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed.
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the dotxaxpyf kernel and the fusing
                  factor, and forwarded to every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
ctype* A10; \
ctype* A11; \
ctype* a10t; \
ctype* alpha11; \
ctype* a21; \
ctype* x0; \
ctype* x1; \
ctype* chi11; \
ctype* y0; \
ctype* y1; \
ctype* y01; \
ctype* psi11; \
ctype* y21; \
ctype conjx_chi11; \
ctype alpha_chi11; \
ctype alpha11_temp; \
dim_t i, k, j; \
dim_t b_fuse, f; \
dim_t n_behind; \
dim_t f_ahead, f_behind; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0/conj1 are the first two conj arguments of the dotxaxpyf call;
inside the diagonal block conj1 selects the a10t' update and conj0 the
a21 update. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
for ( i = 0; i < m; i += f ) \
{ \
/* f is the step taken along the diagonal in this iteration; A10 starts
at row i, column 0 and A11 at the diagonal element (i,i). */ \
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
n_behind = i; \
A10 = a + (i )*rs_at + (0 )*cs_at; \
A11 = a + (i )*rs_at + (i )*cs_at; \
x0 = x + (0 )*incx; \
x1 = x + (i )*incx; \
y0 = y + (0 )*incy; \
y1 = y + (i )*incy; \
\
/* y1 = y1 + alpha * A10 * x0; (dotxf) */ \
/* y0 = y0 + alpha * A10' * x1; (axpyf) */ \
kfp_xf \
( \
conj0, \
conj1, \
conjx, \
conjx, \
n_behind, \
f, \
alpha, \
A10, cs_at, rs_at, \
x0, incx, \
x1, incx, \
one, \
y1, incy, \
y0, incy, \
cntx \
); \
\
/* y1 = y1 + alpha * A11 * x1; (variant 4) */ \
for ( k = 0; k < f; ++k ) \
{ \
/* Partition the diagonal block around its k-th diagonal element. */ \
f_behind = k; \
f_ahead = f - k - 1; \
a10t = A11 + (k )*rs_at + (0 )*cs_at; \
alpha11 = A11 + (k )*rs_at + (k )*cs_at; \
a21 = A11 + (k+1)*rs_at + (k )*cs_at; \
chi11 = x1 + (k )*incx; \
y01 = y1 + (0 )*incy; \
psi11 = y1 + (k )*incy; \
y21 = y1 + (k+1)*incy; \
\
/* y01 = y01 + alpha * a10t' * chi11; */ \
PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
if ( bli_is_conj( conj1 ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
} \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi11 = psi11 + alpha * alpha11 * chi11; */ \
PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
/* y21 = y21 + alpha * a21 * chi11; */ \
if ( bli_is_conj( conj0 ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unf_var1 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unf_var1a.c 0000664 0000000 0000000 00000011167 14634250137 0023722 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unf_var1a -- fused variant built on the dotaxpyv kernel.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero) and then, for each i in [0,m), makes a
  single dotaxpyv call over a10t (row i left of the diagonal, n_behind = i
  elements). Per the inline comments that call covers both
    - the dot product a10t * x0, returned through rho and then added to
      psi1 = y[i] scaled by alpha, and
    - the update y0 = y0 + alpha * a10t' * chi1.
  The diagonal contribution alpha * alpha11 * chi1 is added to psi1 last.

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed (loop trip count).
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the dotaxpyv kernel and forwarded to
                  every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
ctype* a10t; \
ctype* alpha11; \
ctype* x0; \
ctype* chi1; \
ctype* y0; \
ctype* psi1; \
ctype rho; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_behind; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0 and conj1 are the first two conj arguments of the dotaxpyv call
below. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
\
/* Query the context for the kernel function pointer. */ \
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
/* Locate row i left of the diagonal (a10t), the diagonal element
(alpha11) and the matching pieces of x and y. */ \
n_behind = i; \
a10t = a + (i )*rs_at + (0 )*cs_at; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
x0 = x + (0 )*incx; \
chi1 = x + (i )*incx; \
y0 = y + (0 )*incy; \
psi1 = y + (i )*incy; \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* psi1 = psi1 + alpha * a10t * x0; (dotv) */ \
/* y0 = y0 + alpha * a10t' * chi1; (axpyv) */ \
/* NOTE(review): rho is not initialized here; the kernel is presumably
responsible for writing it -- confirm against the dotaxpyv contract. */ \
kfp_vf \
( \
conj0, \
conj1, \
conjx, \
n_behind, \
&alpha_chi1, \
a10t, cs_at, \
x0, incx, \
&rho, \
y0, incy, \
cntx \
); \
/* Fold the dot product into psi1, scaled by alpha. */ \
PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unf_var1a )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unf_var3.c 0000664 0000000 0000000 00000014011 14634250137 0023552 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unf_var3 -- fused variant built on the dotxaxpyf kernel, working on
  the part of A ahead of the current diagonal block.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero). It then walks the diagonal of A in steps of
  f, where f comes from bli_determine_blocksize_dim_f( i, m, b_fuse ) and
  b_fuse is the BLIS_XF block size read from the context. Each step:
    - processes the f-by-f diagonal block A11 with scalar loops laid out
      like variant 4 (a10t', alpha11 and a21 contributions of each chi11);
    - hands the panel A21 (rows i+f.., columns i..) with n_ahead = m-i-f
      to one dotxaxpyf call, which per the inline comments updates y1
      (dotxf part) and y2 (axpyf part).

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed.
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the dotxaxpyf kernel and the fusing
                  factor, and forwarded to every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* zero = PASTEMAC(ch,0); \
ctype* A11; \
ctype* A21; \
ctype* a10t; \
ctype* alpha11; \
ctype* a21; \
ctype* x1; \
ctype* x2; \
ctype* chi11; \
ctype* y1; \
ctype* y2; \
ctype* y01; \
ctype* psi11; \
ctype* y21; \
ctype conjx_chi11; \
ctype alpha_chi11; \
ctype alpha11_temp; \
dim_t i, k, j; \
dim_t b_fuse, f; \
dim_t n_ahead; \
dim_t f_ahead, f_behind; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* Inside the diagonal block conj0 selects the a10t' update and conj1 the
a21 update; both are also the first two conj arguments of the dotxaxpyf
call. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \
\
for ( i = 0; i < m; i += f ) \
{ \
/* f is the step taken along the diagonal in this iteration; A11 starts
at the diagonal element (i,i) and A21 f rows further down. */ \
f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \
n_ahead = m - i - f; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A21 = a + (i+f)*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
y1 = y + (i )*incy; \
y2 = y + (i+f)*incy; \
\
/* y1 = y1 + alpha * A11 * x1; (variant 4) */ \
for ( k = 0; k < f; ++k ) \
{ \
/* Partition the diagonal block around its k-th diagonal element. */ \
f_behind = k; \
f_ahead = f - k - 1; \
a10t = A11 + (k )*rs_at + (0 )*cs_at; \
alpha11 = A11 + (k )*rs_at + (k )*cs_at; \
a21 = A11 + (k+1)*rs_at + (k )*cs_at; \
chi11 = x1 + (k )*incx; \
y01 = y1 + (0 )*incy; \
psi11 = y1 + (k )*incy; \
y21 = y1 + (k+1)*incy; \
\
/* y01 = y01 + alpha * a10t' * chi11; */ \
PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \
if ( bli_is_conj( conj0 ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \
} \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* psi11 = psi11 + alpha * alpha11 * chi11; */ \
PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \
\
/* y21 = y21 + alpha * a21 * chi11; */ \
if ( bli_is_conj( conj1 ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \
} \
} \
\
/* y1 = y1 + alpha * A21' * x2; (dotxf) */ \
/* y2 = y2 + alpha * A21 * x1; (axpyf) */ \
kfp_xf \
( \
conj0, \
conj1, \
conjx, \
conjx, \
n_ahead, \
f, \
alpha, \
A21, rs_at, cs_at, \
x2, incx, \
x1, incx, \
one, \
y1, incy, \
y2, incy, \
cntx \
); \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unf_var3 )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_unf_var3a.c 0000664 0000000 0000000 00000011163 14634250137 0023720 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
/*
  hemv_unf_var3a -- fused variant built on the dotaxpyv kernel, working on
  the part of A ahead of the current diagonal element.

  For every datatype instantiated by INSERT_GENTFUNC_BASIC0, the generated
  function first replaces y with beta * y (y is set to zero outright when
  beta compares equal to zero) and then, for each i in [0,m):
    - adds the diagonal contribution alpha * alpha11 * chi1 to psi1 = y[i];
    - makes a single dotaxpyv call over a21 (column i below the diagonal,
      n_ahead = m-i-1 elements). Per the inline comments that call covers
      both the dot product a21' * x2, returned through rho and then added
      to psi1 scaled by alpha, and the update y2 = y2 + alpha * a21 * chi1.

  Parameters:
    uplo        - selects the stored triangle; for anything other than lower
                  the row/column strides of A are swapped and conj0/conj1
                  trade roles.
    conja       - conjugation applied to the elements of A.
    conjx       - conjugation applied to the elements of x.
    conjh       - combined with conja (bli_apply_conj) for the reflected part
                  of A; when it is a conjugation the imaginary part of each
                  diagonal element is zeroed before use.
                  NOTE(review): presumably BLIS_CONJUGATE for hemv and
                  BLIS_NO_CONJUGATE for symv -- confirm against the callers.
    m           - number of elements of x and y processed (loop trip count).
    alpha, beta - pointers to the scalars.
    a,rs_a,cs_a - matrix A and its row/column strides.
    x, incx     - vector x and its increment.
    y, incy     - vector y and its increment; updated in place.
    cntx        - context queried for the dotaxpyv kernel and forwarded to
                  every kernel call.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* zero = PASTEMAC(ch,0); \
ctype* alpha11; \
ctype* a21; \
ctype* chi1; \
ctype* x2; \
ctype* psi1; \
ctype* y2; \
ctype rho; \
ctype conjx_chi1; \
ctype alpha_chi1; \
ctype alpha11_temp; \
dim_t i; \
dim_t n_ahead; \
inc_t rs_at, cs_at; \
conj_t conj0, conj1; \
\
/* The algorithm will be expressed in terms of the lower triangular case;
the upper triangular case is supported by swapping the row and column
strides of A and toggling some conj parameters. */ \
/* conj0 and conj1 are the first two conj arguments of the dotaxpyv call
below. */ \
if ( bli_is_lower( uplo ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
\
conj0 = bli_apply_conj( conjh, conja ); \
conj1 = conja; \
} \
else /* if ( bli_is_upper( uplo ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
\
conj0 = conja; \
conj1 = bli_apply_conj( conjh, conja ); \
} \
\
/* If beta is zero, use setv. Otherwise, scale by beta. */ \
if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
/* y = 0; */ \
PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
zero, \
y, incy, \
cntx, \
NULL \
); \
} \
else \
{ \
/* y = beta * y; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
beta, \
y, incy, \
cntx, \
NULL \
); \
} \
\
PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
\
/* Query the context for the kernel function pointer. */ \
kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
\
for ( i = 0; i < m; ++i ) \
{ \
/* Locate the diagonal element (alpha11), column i below the diagonal
(a21) and the matching pieces of x and y. */ \
n_ahead = m - i - 1; \
alpha11 = a + (i )*rs_at + (i )*cs_at; \
a21 = a + (i+1)*rs_at + (i )*cs_at; \
chi1 = x + (i )*incx; \
x2 = x + (i+1)*incx; \
psi1 = y + (i )*incy; \
y2 = y + (i+1)*incy; \
\
/* For hemv, explicitly set the imaginary component of alpha11 to
zero. */ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
if ( bli_is_conj( conjh ) ) \
PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
/* Apply conjx to chi1 and scale by alpha. */ \
PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
/* psi1 = psi1 + alpha * a21' * x2; (dotv) */ \
/* y2 = y2 + alpha * a21 * chi1; (axpyv) */ \
/* NOTE(review): rho is not initialized here; the kernel is presumably
responsible for writing it -- confirm against the dotaxpyv contract. */ \
kfp_vf \
( \
conj0, \
conj1, \
conjx, \
n_ahead, \
&alpha_chi1, \
a21, rs_at, \
x2, incx, \
&rho, \
y2, incy, \
cntx \
); \
/* Fold the dot product into psi1, scaled by alpha. */ \
PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \
} \
}
INSERT_GENTFUNC_BASIC0( hemv_unf_var3a )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_var.h 0000664 0000000 0000000 00000006004 14634250137 0022627 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one function, named by PASTEMAC0(opname), that
// takes the hemv operands as obj_t pointers together with a context and a
// control-tree node. It is instantiated below once per hemv variant
// (blocked, unblocked and unblocked-fused).
//
// NOTE(review): the cntl parameter is declared here as cntl_t*, whereas the
// bli_hemv_blk_var[1-4] definitions under other/ take hemv_t* -- confirm
// which of the two is intended if those files are ever compiled together.
//
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
conj_t conjh, \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
obj_t* beta, \
obj_t* y, \
cntx_t* cntx, \
cntl_t* cntl \
);
GENPROT( hemv_blk_var1 )
GENPROT( hemv_blk_var2 )
GENPROT( hemv_blk_var3 )
GENPROT( hemv_blk_var4 )
GENPROT( hemv_unb_var1 )
GENPROT( hemv_unb_var2 )
GENPROT( hemv_unb_var3 )
GENPROT( hemv_unb_var4 )
GENPROT( hemv_unf_var1 )
GENPROT( hemv_unf_var3 )
GENPROT( hemv_unf_var1a )
GENPROT( hemv_unf_var3a )
//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares one function, named by
// PASTEMAC(ch,varname), whose scalar, matrix and vector operands are raw
// ctype pointers accompanied by their strides/increments. Each
// INSERT_GENTPROT_BASIC0 line below instantiates the prototype for one
// unblocked or unblocked-fused variant (presumably once per supported
// datatype -- the expansion of INSERT_GENTPROT_BASIC0 is defined elsewhere).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conja, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( hemv_unb_var1 )
INSERT_GENTPROT_BASIC0( hemv_unb_var2 )
INSERT_GENTPROT_BASIC0( hemv_unb_var3 )
INSERT_GENTPROT_BASIC0( hemv_unb_var4 )
INSERT_GENTPROT_BASIC0( hemv_unf_var1 )
INSERT_GENTPROT_BASIC0( hemv_unf_var3 )
INSERT_GENTPROT_BASIC0( hemv_unf_var1a )
INSERT_GENTPROT_BASIC0( hemv_unf_var3a )
cython-blis-1.0.0/blis/_src/frame/2/hemv/bli_hemv_var_oapi.c 0000664 0000000 0000000 00000006442 14634250137 0023640 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define the object-based front-ends for the typed hemv variants.
//
// GENFRONT( opname, varname ) emits a function named by PASTEMAC0(varname)
// that:
//   1. calls bli_init_once();
//   2. reads the datatype, uplo and conjugation status from the obj_t
//      operands, along with the dimension, buffers, strides/increments and
//      the 1x1 scalar buffers for alpha and beta;
//   3. queries the datatype-specific function pointer with
//      PASTEMAC(varname,_qfp)( dt ); and
//   4. invokes it with the unpacked operands and the context.
//
// The cntl argument is part of the signature but is not referenced by the
// generated body.
//
// The macro definition ends at the closing brace. No line continuation
// follows the brace, so the definition cannot absorb whatever source line
// comes next (a trailing backslash there is only harmless when the next
// line happens to be blank).
//
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       conj_t  conjh, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
	bli_init_once(); \
\
	num_t     dt        = bli_obj_dt( a ); \
\
	uplo_t    uplo      = bli_obj_uplo( a ); \
	conj_t    conja     = bli_obj_conj_status( a ); \
	conj_t    conjx     = bli_obj_conj_status( x ); \
\
	dim_t     m         = bli_obj_length( a ); \
\
	void*     buf_a     = bli_obj_buffer_at_off( a ); \
	inc_t     rs_a      = bli_obj_row_stride( a ); \
	inc_t     cs_a      = bli_obj_col_stride( a ); \
\
	void*     buf_x     = bli_obj_buffer_at_off( x ); \
	inc_t     incx      = bli_obj_vector_inc( x ); \
\
	void*     buf_y     = bli_obj_buffer_at_off( y ); \
	inc_t     incy      = bli_obj_vector_inc( y ); \
\
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
	void*     buf_beta  = bli_obj_buffer_for_1x1( dt, beta ); \
\
	/* Query a type-specific function pointer, except one that uses */ \
	/* void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  uplo, \
	  conja, \
	  conjx, \
	  conjh, \
	  m, \
	  buf_alpha, \
	  buf_a, rs_a, cs_a, \
	  buf_x, incx, \
	  buf_beta, \
	  buf_y, incy, \
	  cntx  \
	); \
}

GENFRONT( hemv, hemv_unb_var1 )
GENFRONT( hemv, hemv_unb_var2 )
GENFRONT( hemv, hemv_unb_var3 )
GENFRONT( hemv, hemv_unb_var4 )
GENFRONT( hemv, hemv_unf_var1 )
GENFRONT( hemv, hemv_unf_var3 )
GENFRONT( hemv, hemv_unf_var1a )
GENFRONT( hemv, hemv_unf_var3a )
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/ 0000775 0000000 0000000 00000000000 14634250137 0021142 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_blk_var1.c 0000664 0000000 0000000 00000013310 14634250137 0024652 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked hemv, variant 1.
//
// Scales y by beta up front, then walks the diagonal of a in blocks of
// b_alg rows/columns. For the current diagonal block A11, with A10 the
// block to its left, each iteration performs:
//
//   y0 = y0 + alpha * A10' * x1;
//   y1 = y1 + alpha * A11  * x1;
//   y1 = y1 + alpha * A10  * x0;
//
// A11, x1 and y1 are routed through the pack objects a11_pack, x1_pack and
// y1_pack; y1_pack is unpacked back into y1 at the end of each iteration.
// The sub-operations are driven by the sub-trees stored in cntl. conjh is
// forwarded to the hemv sub-problem and applied to the transposed gemv.
void bli_hemv_blk_var1( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a10;
obj_t x1, x1_pack;
obj_t x0;
obj_t y1, y1_pack;
obj_t y0;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Even though this blocked algorithm is expressed only in terms of the
// lower triangular case, the upper triangular case is still supported:
// when bli_acquire_mpart_tl2br() is passed a matrix that is stored
// in the upper triangle, and the requested subpartition resides in the
// lower triangle (as is the case for this algorithm), the routine fills
// the request as if the caller had actually requested the corresponding
// "mirror" subpartition in the upper triangle, except that it marks the
// subpartition for transposition (and conjugation).
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
bli_obj_init_pack( &y1_pack );
// Query dimension.
mn = bli_obj_length( a );
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, x1, x0, y1, and y0.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_tl2br( BLIS_SUBPART10,
ij, b_alg, a, &a10 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_f2b( BLIS_SUBPART0,
ij, b_alg, x, &x0 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, y, &y1 );
bli_acquire_vpart_f2b( BLIS_SUBPART0,
ij, b_alg, y, &y0 );
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y0 = y0 + alpha * A10' * x1;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
BLIS_NO_CONJUGATE,
alpha,
&a10,
&x1_pack,
&BLIS_ONE,
&y0,
cntx,
bli_cntl_sub_gemv_t_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
alpha,
&a11_pack,
&x1_pack,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_hemv( cntl ) );
// y1 = y1 + alpha * A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
alpha,
&a10,
&x0,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_gemv_n_rp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm or packv, release
// them back to the memory manager.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_blk_var2.c 0000664 0000000 0000000 00000013473 14634250137 0024665 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked hemv, variant 2.
//
// Scales y by beta up front, then walks the diagonal of a in blocks of
// b_alg rows/columns. For the current diagonal block A11, with A10 to its
// left and A21 below it, each iteration accumulates only into y1:
//
//   y1 = y1 + alpha * A10  * x0;
//   y1 = y1 + alpha * A11  * x1;
//   y1 = y1 + alpha * A21' * x2;
//
// A11, x1 and y1 are routed through the pack objects a11_pack, x1_pack and
// y1_pack; y1_pack is unpacked back into y1 at the end of each iteration.
// The sub-operations are driven by the sub-trees stored in cntl. conjh is
// forwarded to the hemv sub-problem and applied to the transposed gemv.
void bli_hemv_blk_var2( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a10;
obj_t a21;
obj_t x1, x1_pack;
obj_t x0;
obj_t x2;
obj_t y1, y1_pack;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Even though this blocked algorithm is expressed only in terms of the
// lower triangular case, the upper triangular case is still supported:
// when bli_acquire_mpart_tl2br() is passed a matrix that is stored
// in the upper triangle, and the requested subpartition resides in the
// lower triangle (as is the case for this algorithm), the routine fills
// the request as if the caller had actually requested the corresponding
// "mirror" subpartition in the upper triangle, except that it marks the
// subpartition for transposition (and conjugation).
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
bli_obj_init_pack( &y1_pack );
// Query dimension.
mn = bli_obj_length( a );
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, A21, x1, x0, x2, and y1.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_tl2br( BLIS_SUBPART10,
ij, b_alg, a, &a10 );
bli_acquire_mpart_tl2br( BLIS_SUBPART21,
ij, b_alg, a, &a21 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_f2b( BLIS_SUBPART0,
ij, b_alg, x, &x0 );
bli_acquire_vpart_f2b( BLIS_SUBPART2,
ij, b_alg, x, &x2 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, y, &y1 );
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y1 = y1 + alpha * A10 * x0;
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
alpha,
&a10,
&x0,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_gemv_n_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
alpha,
&a11_pack,
&x1_pack,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_hemv( cntl ) );
// y1 = y1 + alpha * A21' * x2;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
BLIS_NO_CONJUGATE,
alpha,
&a21,
&x2,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_gemv_t_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm or packv, release
// them back to the memory manager.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_blk_var3.c 0000664 0000000 0000000 00000013310 14634250137 0024654 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked hemv, variant 3.
//
// Scales y by beta up front, then walks the diagonal of a in blocks of
// b_alg rows/columns. For the current diagonal block A11, with A21 the
// block below it, each iteration performs:
//
//   y1 = y1 + alpha * A21' * x2;
//   y1 = y1 + alpha * A11  * x1;
//   y2 = y2 + alpha * A21  * x1;
//
// A11, x1 and y1 are routed through the pack objects a11_pack, x1_pack and
// y1_pack; y1_pack is unpacked back into y1 at the end of each iteration.
// The sub-operations are driven by the sub-trees stored in cntl. conjh is
// forwarded to the hemv sub-problem and applied to the transposed gemv.
void bli_hemv_blk_var3( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a21;
obj_t x1, x1_pack;
obj_t x2;
obj_t y1, y1_pack;
obj_t y2;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Even though this blocked algorithm is expressed only in terms of the
// lower triangular case, the upper triangular case is still supported:
// when bli_acquire_mpart_tl2br() is passed a matrix that is stored
// in the upper triangle, and the requested subpartition resides in the
// lower triangle (as is the case for this algorithm), the routine fills
// the request as if the caller had actually requested the corresponding
// "mirror" subpartition in the upper triangle, except that it marks the
// subpartition for transposition (and conjugation).
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
bli_obj_init_pack( &y1_pack );
// Query dimension.
mn = bli_obj_length( a );
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A21, x1, x2, y1, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_tl2br( BLIS_SUBPART21,
ij, b_alg, a, &a21 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_f2b( BLIS_SUBPART2,
ij, b_alg, x, &x2 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, y, &y1 );
bli_acquire_vpart_f2b( BLIS_SUBPART2,
ij, b_alg, y, &y2 );
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y1 = y1 + alpha * A21' * x2;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
BLIS_NO_CONJUGATE,
alpha,
&a21,
&x2,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_gemv_t_cp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
alpha,
&a11_pack,
&x1_pack,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_hemv( cntl ) );
// y2 = y2 + alpha * A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
alpha,
&a21,
&x1_pack,
&BLIS_ONE,
&y2,
cntx,
bli_cntl_sub_gemv_n_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm or packv, release
// them back to the memory manager.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_blk_var4.c 0000664 0000000 0000000 00000013467 14634250137 0024672 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked hemv, variant 4.
//
// Scales y by beta up front, then walks the diagonal of a in blocks of
// b_alg rows/columns. For the current diagonal block A11, with A10 to its
// left and A21 below it, each iteration uses only x1 and updates all three
// pieces of y:
//
//   y0 = y0 + alpha * A10' * x1;
//   y1 = y1 + alpha * A11  * x1;
//   y2 = y2 + alpha * A21  * x1;
//
// A11, x1 and y1 are routed through the pack objects a11_pack, x1_pack and
// y1_pack; y1_pack is unpacked back into y1 at the end of each iteration.
// The sub-operations are driven by the sub-trees stored in cntl. conjh is
// forwarded to the hemv sub-problem and applied to the transposed gemv.
void bli_hemv_blk_var4( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a10;
obj_t a21;
obj_t x1, x1_pack;
obj_t y1, y1_pack;
obj_t y0;
obj_t y2;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Even though this blocked algorithm is expressed only in terms of the
// lower triangular case, the upper triangular case is still supported:
// when bli_acquire_mpart_tl2br() is passed a matrix that is stored
// in the upper triangle, and the requested subpartition resides in the
// lower triangle (as is the case for this algorithm), the routine fills
// the request as if the caller had actually requested the corresponding
// "mirror" subpartition in the upper triangle, except that it marks the
// subpartition for transposition (and conjugation).
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
bli_obj_init_pack( &y1_pack );
// Query dimension.
mn = bli_obj_length( a );
// y = beta * y;
bli_scalv_int( beta,
y,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_f( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A10, A21, x1, y1, y0, and y2.
bli_acquire_mpart_tl2br( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_tl2br( BLIS_SUBPART10,
ij, b_alg, a, &a10 );
bli_acquire_mpart_tl2br( BLIS_SUBPART21,
ij, b_alg, a, &a21 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_f2b( BLIS_SUBPART1,
ij, b_alg, y, &y1 );
bli_acquire_vpart_f2b( BLIS_SUBPART0,
ij, b_alg, y, &y0 );
bli_acquire_vpart_f2b( BLIS_SUBPART2,
ij, b_alg, y, &y2 );
// Initialize objects for packing A11, x1, and y1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_init( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// Copy/pack A11, x1, y1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_int( &y1, &y1_pack,
cntx, bli_cntl_sub_packv_y1( cntl ) );
// y0 = y0 + alpha * A10' * x1;
bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ),
BLIS_NO_CONJUGATE,
alpha,
&a10,
&x1_pack,
&BLIS_ONE,
&y0,
cntx,
bli_cntl_sub_gemv_t_rp( cntl ) );
// y1 = y1 + alpha * A11 * x1;
bli_hemv_int( conjh,
alpha,
&a11_pack,
&x1_pack,
&BLIS_ONE,
&y1_pack,
cntx,
bli_cntl_sub_hemv( cntl ) );
// y2 = y2 + alpha * A21 * x1;
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
alpha,
&a21,
&x1_pack,
&BLIS_ONE,
&y2,
cntx,
bli_cntl_sub_gemv_n_cp( cntl ) );
// Copy/unpack y1 (if y1 was packed).
bli_unpackv_int( &y1_pack, &y1,
cntx, bli_cntl_sub_unpackv_y1( cntl ) );
}
// If any packing buffers were acquired within packm or packv, release
// them back to the memory manager.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_cntl.c 0000664 0000000 0000000 00000016644 14634250137 0024126 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees owned by other operations (defined in other translation
// units) that the hemv trees below reference as sub-trees.
extern scalv_t* scalv_cntl;
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackv_t* unpackv_cntl;
extern gemv_t* gemv_cntl_rp_bs_dot;
extern gemv_t* gemv_cntl_rp_bs_axpy;
// NOTE(review): the two _cp_ trees are declared here but are not referenced
// anywhere else in this file.
extern gemv_t* gemv_cntl_cp_bs_dot;
extern gemv_t* gemv_cntl_cp_bs_axpy;
// hemv control trees defined by this file. They start out NULL, are
// populated by bli_hemv_cntl_init() and are freed by
// bli_hemv_cntl_finalize().
hemv_t* hemv_cntl_bs_ke_lrow_ucol = NULL;
hemv_t* hemv_cntl_bs_ke_lcol_urow = NULL;
hemv_t* hemv_cntl_ge_lrow_ucol = NULL;
hemv_t* hemv_cntl_ge_lcol_urow = NULL;
// Build the four module-level hemv control trees: two leaf trees that select
// an unblocked-fused variant (variant 1 and variant 3), and two blocked
// trees that each use one of those leaves as their hemv sub-problem. The
// leaf trees must be created first because the blocked trees store pointers
// to them.
//
// NOTE(review): both blocked trees select BLIS_VARIANT2, and in both the
// argument slot that bli_hemv_cntl_obj_create() names sub_gemv_t_cp is given
// one of the gemv_cntl_rp_* trees rather than a gemv_cntl_cp_* tree. Confirm
// that this is intentional.
void bli_hemv_cntl_init()
{
// Create control trees for the lowest-level kernels. These trees induce
// operations on (presumably) relatively small block-subvector problems.
hemv_cntl_bs_ke_lrow_ucol
=
bli_hemv_cntl_obj_create( BLIS_UNB_FUSED,
BLIS_VARIANT1,
0,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL );
hemv_cntl_bs_ke_lcol_urow
=
bli_hemv_cntl_obj_create( BLIS_UNB_FUSED,
BLIS_VARIANT3,
0,
NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL,
NULL, NULL );
// Create control trees for generally large problems. Here, we choose a
// variant that prioritizes keeping a subvector of y in cache.
hemv_cntl_ge_lrow_ucol
=
bli_hemv_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_M2,
scalv_cntl, // scale y up-front
packm_cntl, // pack A11 (if needed)
packv_cntl, // pack x1 (if needed)
packv_cntl, // pack y1 (if needed)
gemv_cntl_rp_bs_dot, // gemv_n_rp needed by var2
NULL, // gemv_n_cp not used by var2
NULL, // gemv_t_rp not used by var2
gemv_cntl_rp_bs_axpy, // gemv_t_cp needed by var2
hemv_cntl_bs_ke_lrow_ucol,
unpackv_cntl ); // unpack y1 (if packed)
// Same shape as the tree above, with the dot/axpy gemv sub-trees swapped
// and the variant-3 leaf used for the hemv sub-problem.
hemv_cntl_ge_lcol_urow
=
bli_hemv_cntl_obj_create( BLIS_BLOCKED,
BLIS_VARIANT2,
BLIS_M2,
scalv_cntl, // scale y up-front
packm_cntl, // pack A11 (if needed)
packv_cntl, // pack x1 (if needed)
packv_cntl, // pack y1 (if needed)
gemv_cntl_rp_bs_axpy, // gemv_n_rp needed by var2
NULL, // gemv_n_cp not used by var2
NULL, // gemv_t_rp not used by var2
gemv_cntl_rp_bs_dot, // gemv_t_cp needed by var2
hemv_cntl_bs_ke_lcol_urow,
unpackv_cntl ); // unpack y1 (if packed)
}
void bli_hemv_cntl_finalize()
{
bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( hemv_cntl_ge_lrow_ucol );
bli_cntl_free_node( hemv_cntl_ge_lcol_urow );
}
// Allocate a hemv control-tree node with bli_malloc_intl(), store every
// argument in the field of the same name, and return the new node. The
// sub-tree pointers are stored as given; nothing is copied.
hemv_t* bli_hemv_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  scalv_t*   sub_scalv,
                                  packm_t*   sub_packm_a11,
                                  packv_t*   sub_packv_x1,
                                  packv_t*   sub_packv_y1,
                                  gemv_t*    sub_gemv_n_rp,
                                  gemv_t*    sub_gemv_n_cp,
                                  gemv_t*    sub_gemv_t_rp,
                                  gemv_t*    sub_gemv_t_cp,
                                  hemv_t*    sub_hemv,
                                  unpackv_t* sub_unpackv_y1 )
{
	hemv_t* node = ( hemv_t* ) bli_malloc_intl( sizeof( *node ) );

	// Sub-trees for the level-2 sub-problems.
	node->sub_hemv       = sub_hemv;
	node->sub_gemv_t_cp  = sub_gemv_t_cp;
	node->sub_gemv_t_rp  = sub_gemv_t_rp;
	node->sub_gemv_n_cp  = sub_gemv_n_cp;
	node->sub_gemv_n_rp  = sub_gemv_n_rp;

	// Sub-trees for scaling, packing and unpacking.
	node->sub_scalv      = sub_scalv;
	node->sub_packm_a11  = sub_packm_a11;
	node->sub_packv_x1   = sub_packv_x1;
	node->sub_packv_y1   = sub_packv_y1;
	node->sub_unpackv_y1 = sub_unpackv_y1;

	// Implementation type, variant number and blocksize id.
	node->impl_type      = impl_type;
	node->var_num        = var_num;
	node->bszid          = bszid;

	return node;
}
// Fill in a caller-provided hemv control-tree node: every field of *cntl is
// overwritten with the argument of the same name. No memory is allocated and
// the sub-tree pointers are stored as given.
void bli_hemv_cntl_obj_init( hemv_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             scalv_t*   sub_scalv,
                             packm_t*   sub_packm_a11,
                             packv_t*   sub_packv_x1,
                             packv_t*   sub_packv_y1,
                             gemv_t*    sub_gemv_n_rp,
                             gemv_t*    sub_gemv_n_cp,
                             gemv_t*    sub_gemv_t_rp,
                             gemv_t*    sub_gemv_t_cp,
                             hemv_t*    sub_hemv,
                             unpackv_t* sub_unpackv_y1 )
{
	// Build the complete node value first, then store it in one assignment.
	*cntl = ( hemv_t )
	{
		.impl_type      = impl_type,
		.var_num        = var_num,
		.bszid          = bszid,
		.sub_scalv      = sub_scalv,
		.sub_packm_a11  = sub_packm_a11,
		.sub_packv_x1   = sub_packv_x1,
		.sub_packv_y1   = sub_packv_y1,
		.sub_gemv_n_rp  = sub_gemv_n_rp,
		.sub_gemv_n_cp  = sub_gemv_n_cp,
		.sub_gemv_t_rp  = sub_gemv_t_rp,
		.sub_gemv_t_cp  = sub_gemv_t_cp,
		.sub_hemv       = sub_hemv,
		.sub_unpackv_y1 = sub_unpackv_y1
	};
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_cntl.h 0000664 0000000 0000000 00000007274 14634250137 0024132 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control-tree node for the hemv operation. A node records which variant
// to run (var_num, impl_type), a blocksize id, and pointers to the child
// nodes used for the suboperations of that variant.
struct hemv_s
{
// Implementation type; bli_hemv_int() uses bli_cntl_impl_type() as the
// column index into its table of variants.
impl_t impl_type;
// Variant number; bli_hemv_int() uses bli_cntl_var_num() as the row index
// into its table of variants.
varnum_t var_num;
// Blocksize identifier (presumably consulted by the blocked variants when
// determining the algorithmic blocksize -- confirm in bli_hemv_blk_var*.c).
bszid_t bszid;
// Child nodes, one per suboperation. The suffixes mirror the names of the
// operands they apply to (a11, x1, y1); n/t and rp/cp distinguish the four
// gemv subproblems.
struct scalv_s* sub_scalv;
struct packm_s* sub_packm_a11;
struct packv_s* sub_packv_x1;
struct packv_s* sub_packv_y1;
struct gemv_s* sub_gemv_n_rp;
struct gemv_s* sub_gemv_n_cp;
struct gemv_s* sub_gemv_t_rp;
struct gemv_s* sub_gemv_t_cp;
// Child hemv node, allowing hemv control trees to be nested.
struct hemv_s* sub_hemv;
struct unpackv_s* sub_unpackv_y1;
};
typedef struct hemv_s hemv_t;
// Accessor for the nested hemv child of a hemv control-tree node. Both the
// argument and the whole expansion are parenthesized so that the macro
// remains correct when given an arbitrary pointer expression (for example
// "nodes + 1") or when used inside a larger expression. The result is still
// an lvalue, so existing uses on either side of an assignment keep working.
#define bli_cntl_sub_hemv( cntl ) ( (cntl)->sub_hemv )
// Set-up and tear-down entry points for the hemv control trees.
// NOTE(review): presumably these create and free the hemv_cntl_* trees that
// bli_hemv_front.c references as externs -- confirm in bli_hemv_cntl.c.
void bli_hemv_cntl_init( void );
void bli_hemv_cntl_finalize( void );
// Returns a pointer to a hemv control-tree node built from the given variant
// selection (impl_type, var_num), blocksize id, and child nodes. The
// parameters correspond one-to-one, in order, to the fields of struct hemv_s.
// NOTE(review): ownership of the returned node is not visible from this
// header -- confirm how it is allocated and released.
hemv_t* bli_hemv_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalv_t* sub_scalv,
packm_t* sub_packm_a11,
packv_t* sub_packv_x1,
packv_t* sub_packv_y1,
gemv_t* sub_gemv_n_rp,
gemv_t* sub_gemv_n_cp,
gemv_t* sub_gemv_t_rp,
gemv_t* sub_gemv_t_cp,
hemv_t* sub_hemv,
unpackv_t* sub_unpackv_y1 );
// Same parameter list as bli_hemv_cntl_obj_create(), except that the node to
// fill in is supplied by the caller as cntl and nothing is returned. The
// parameters correspond one-to-one, in order, to the fields of struct hemv_s.
void bli_hemv_cntl_obj_init( hemv_t* cntl,
impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
scalv_t* sub_scalv,
packm_t* sub_packm_a11,
packv_t* sub_packv_x1,
packv_t* sub_packv_y1,
gemv_t* sub_gemv_n_rp,
gemv_t* sub_gemv_n_cp,
gemv_t* sub_gemv_t_rp,
gemv_t* sub_gemv_t_cp,
hemv_t* sub_hemv,
unpackv_t* sub_unpackv_y1 );
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_front.c 0000664 0000000 0000000 00000016255 14634250137 0024314 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees selected by bli_hemv_front(); they are defined elsewhere.
// The "bs_ke" pair is used when all operands have unit stride, and the "ge"
// pair is used otherwise. Within each pair, "lrow_ucol" serves the
// lower/row and upper/column cases and "lcol_urow" serves the lower/column
// and upper/row cases.
extern hemv_t* hemv_cntl_bs_ke_lrow_ucol;
extern hemv_t* hemv_cntl_bs_ke_lcol_urow;
extern hemv_t* hemv_cntl_ge_lrow_ucol;
extern hemv_t* hemv_cntl_ge_lcol_urow;
// Object-based front-end for hemv. Checks the operands (when error checking
// is enabled), makes typecast local copies of alpha and beta, picks one of
// the four hemv control trees according to the strides and the uplo/storage
// of the operands, and finally calls bli_hemv_int() with conjh set to
// BLIS_CONJUGATE.
void bli_hemv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx
     )
{
    obj_t   alpha_cast;
    obj_t   beta_cast;
    hemv_t* tree;

    // Validate the operands if error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_hemv_check( alpha, a, x, beta, y );

    // Target datatypes of the matrix and of both vectors.
    const num_t dt_a = bli_obj_target_dt( a );
    const num_t dt_x = bli_obj_target_dt( x );
    const num_t dt_y = bli_obj_target_dt( y );

    // Determine whether each operand has unit stride. The matrix counts as
    // having unit stride when it is either row-stored or column-stored.
    const bool a_unit = ( bli_obj_is_row_stored( a ) ||
                          bli_obj_is_col_stored( a ) );
    const bool x_unit = ( bli_obj_vector_inc( x ) == 1 );
    const bool y_unit = ( bli_obj_vector_inc( y ) == 1 );

    // Copy-cast alpha into a local object. The type union of the target
    // datatypes of a and x is used so that no information is lost
    // unnecessarily during the computation.
    const num_t dt_alpha = bli_dt_union( dt_a, dt_x );
    bli_obj_scalar_init_detached_copy_of( dt_alpha,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    // Copy-cast beta into a local object using the target datatype of y.
    // If y is real and beta is complex, the imaginary part of beta*y would
    // never be stored, so there is no point in keeping beta complex; if y is
    // complex and beta is real, beta is harmlessly promoted to complex.
    bli_obj_scalar_init_detached_copy_of( dt_y,
                                          BLIS_NO_CONJUGATE,
                                          beta,
                                          &beta_cast );

    if ( !( a_unit && x_unit && y_unit ) )
    {
        // At least one operand lacks unit stride, so a blocked tree is
        // needed. Operands that do have unit stride are marked as already
        // packed, which avoids unnecessary packing inside the blocked
        // algorithm.
        if ( a_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );
        if ( x_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
        if ( y_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );

        // Two trees cover the four uplo/tilt combinations: lower with row
        // tilt shares a tree with upper with column tilt, and likewise for
        // the remaining two cases.
        if ( bli_obj_is_lower( a ) )
            tree = ( bli_obj_is_row_tilted( a ) ? hemv_cntl_ge_lrow_ucol
                                                : hemv_cntl_ge_lcol_urow );
        else // if ( bli_obj_is_upper( a ) )
            tree = ( bli_obj_is_row_tilted( a ) ? hemv_cntl_ge_lcol_urow
                                                : hemv_cntl_ge_lrow_ucol );
    }
    else
    {
        // Every operand has unit stride, so choose a tree that calls the
        // unblocked implementation directly, without any blocking. As above,
        // two trees cover the four uplo/storage combinations: row-stored
        // lower and column-stored upper are identical, as are the other two.
        if ( bli_obj_is_lower( a ) )
            tree = ( bli_obj_is_row_stored( a ) ? hemv_cntl_bs_ke_lrow_ucol
                                                : hemv_cntl_bs_ke_lcol_urow );
        else // if ( bli_obj_is_upper( a ) )
            tree = ( bli_obj_is_row_stored( a ) ? hemv_cntl_bs_ke_lcol_urow
                                                : hemv_cntl_bs_ke_lrow_ucol );
    }

    // Hand off to the internal back-end with the local scalar copies and the
    // selected control tree. Passing BLIS_CONJUGATE as conjh requests the
    // Hermitian (rather than symmetric) algorithms.
    bli_hemv_int( BLIS_CONJUGATE,
                  &alpha_cast,
                  a,
                  x,
                  &beta_cast,
                  y,
                  cntx,
                  tree );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
// Generator for the typed hemv_front wrappers. Each generated function wraps
// its raw buffers and strides in local objects, marks the matrix object as
// Hermitian with the requested uplo and conjugation, applies conjx to the x
// object, and forwards everything to the object-based function named by
// opname.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* The vectors are described as m x 1 matrices: the row stride is the \
       vector increment and the column stride is m times that increment. */ \
    inc_t rs_x = incx; \
    inc_t cs_x = m * incx; \
    inc_t rs_y = incy; \
    inc_t cs_y = m * incy; \
\
    obj_t alphao, betao; \
    obj_t ao, xo, yo; \
\
    /* Wrap the two scalars. */ \
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
    bli_obj_create_1x1_with_attached_buffer( dt, beta,  &betao ); \
\
    /* Wrap the m x m matrix and the two vectors of length m. */ \
    bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
    bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
    bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \
\
    /* Record the caller's uplo and conjugation requests on the objects. */ \
    bli_obj_set_uplo( uploa, &ao ); \
    bli_obj_set_conj( conja, &ao ); \
    bli_obj_set_conj( conjx, &xo ); \
\
    /* Mark the matrix as Hermitian. */ \
    bli_obj_set_struc( BLIS_HERMITIAN, &ao ); \
\
    /* Call the object-based implementation. */ \
    PASTEMAC0(opname)( &alphao, \
                       &ao, \
                       &xo, \
                       &betao, \
                       &yo, \
                       cntx ); \
}

INSERT_GENTFUNC_BASIC0( hemv_front )
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_front.h 0000664 0000000 0000000 00000004410 14634250137 0024307 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based hemv front-end. Takes the scalars alpha and beta, the matrix
// a, the vectors x and y, and a context. The definition in bli_hemv_front.c
// selects a hemv control tree and calls bli_hemv_int() with conjh set to
// BLIS_CONJUGATE.
void bli_hemv_front
(
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx
);
//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//
// Prototype generator for the typed hemv_front wrappers: uplo and
// conjugation flags, the dimension m, then each operand as a typed pointer
// followed by its stride(s), and finally the context.
// NOTE(review): the matching definitions in bli_hemv_front.c are
// instantiated with INSERT_GENTFUNC_BASIC0, whereas the prototypes below use
// INSERT_GENTPROT_BASIC (no trailing 0) -- confirm that this inserter is the
// intended one for a three-parameter GENTPROT.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploa, \
conj_t conja, \
conj_t conjx, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( hemv_front )
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_int.c 0000664 0000000 0000000 00000010244 14634250137 0023746 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by every object-based hemv variant.
#define FUNCPTR_T hemv_fp
typedef void (*FUNCPTR_T)( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl );
// Dispatch table used by bli_hemv_int(). The first index is the variant
// number taken from the control tree and the second index is the
// implementation type. Variants 2 and 4 have no fused unblocked
// implementation, so those two slots are NULL.
static FUNCPTR_T vars[4][3] =
{
// unblocked unblocked with fusing blocked
{ bli_hemv_unb_var1, bli_hemv_unf_var1, bli_hemv_blk_var1, },
{ bli_hemv_unb_var2, NULL, bli_hemv_blk_var2, },
{ bli_hemv_unb_var3, bli_hemv_unf_var3, bli_hemv_blk_var3, },
{ bli_hemv_unb_var4, NULL, bli_hemv_blk_var4, },
};
// Internal back-end shared by hemv and symv. Checks the operands (as hemv
// when conjh is a conjugation, as symv otherwise), handles the zero-dimension
// cases, and then dispatches to the variant that the control tree selects.
void bli_hemv_int( conj_t  conjh,
                   obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   obj_t*  beta,
                   obj_t*  y,
                   cntx_t* cntx,
                   hemv_t* cntl )
{
    // Validate the operands if error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        if ( bli_is_conj( conjh ) )
        {
            bli_hemv_check( alpha, a, x, beta, y );
        }
        else
        {
            bli_symv_check( alpha, a, x, beta, y );
        }
    }

    // Nothing to compute when y has a zero dimension.
    if ( bli_obj_has_zero_dim( y ) ) return;

    // When x has a zero dimension the product vanishes, so only scale y by
    // beta.
    if ( bli_obj_has_zero_dim( x ) )
    {
        bli_scalm( beta, y );
        return;
    }

    // Work on an alias of A in case the upper triangular case needs to be
    // induced.
    obj_t a_alias;
    bli_obj_alias_to( a, &a_alias );

/*
    // Our blocked algorithms only [explicitly] implement the lower triangular
    // case, so if matrix A is stored as upper triangular, we must toggle the
    // transposition (and conjugation) bits so that the diagonal partitioning
    // routines grab the correct partitions corresponding to the upper
    // triangular case. But we only need to do this for blocked algorithms,
    // since unblocked algorithms are responsible for handling the upper case
    // explicitly (and they should not be inspecting the transposition bit anyway).
    if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( a ) )
    {
        bli_obj_toggle_conj( &a_alias );
        bli_obj_toggle_trans( &a_alias );
    }
*/

    // The control tree supplies the variant number (row) and the
    // implementation type (column) used to index the table of variants.
    const varnum_t var_idx  = bli_cntl_var_num( cntl );
    const impl_t   impl_idx = bli_cntl_impl_type( cntl );

    // NOTE(review): the table holds NULL for the fused forms of variants 2
    // and 4, and neither index is range-checked here, so a control tree that
    // selects one of those slots would call through a null pointer.
    FUNCPTR_T variant = vars[var_idx][impl_idx];

    // Run the selected variant on the aliased matrix.
    variant( conjh,
             alpha,
             &a_alias,
             x,
             beta,
             y,
             cntx,
             cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/hemv/other/bli_hemv_int.h 0000664 0000000 0000000 00000003614 14634250137 0023756 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal hemv/symv back-end. conjh selects between the two: the definition
// in bli_hemv_int.c checks the operands as hemv when conjh is a conjugation
// and as symv otherwise. cntl is the control tree whose variant number and
// implementation type choose the variant to run.
void bli_hemv_int( conj_t conjh,
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx,
hemv_t* cntl );
cython-blis-1.0.0/blis/_src/frame/2/her/ 0000775 0000000 0000000 00000000000 14634250137 0017640 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/her/bli_her.h 0000664 0000000 0000000 00000003463 14634250137 0021423 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_her_cntl.h"
//#include "bli_her_front.h"
//#include "bli_her_int.h"
#include "bli_her_var.h"
cython-blis-1.0.0/blis/_src/frame/2/her/bli_her_unb_var1.c 0000664 0000000 0000000 00000011353 14634250137 0023210 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Generator for the typed unblocked variant 1 of her. The same code also
// performs syr when conjh is not a conjugation, which is why alpha is passed
// as a full ctype rather than as a real value.
//
// The algorithm is written for the lower triangular case. Iteration i
// updates the i elements to the left of the diagonal in row i (c10t) with an
// axpyv kernel call, and then updates the diagonal element gamma11. The
// upper triangular case is handled by swapping the strides of C and
// adjusting the conjugation flags.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, /* complex alpha allows her variants to also perform syr. */ \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  x0; \
    ctype*  chi1; \
    ctype*  c10t; \
    ctype*  gamma11; \
    ctype   alpha_local; \
    ctype   alpha_chi1; \
    ctype   alpha_chi1_chi1; \
    ctype   conjx0_chi1; \
    ctype   conjx1_chi1; \
    dim_t   i; \
    dim_t   n_behind; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
\
    /* Make a local copy of alpha and zero out the imaginary component if \
       we are being invoked as her, since her requires alpha to be real. */ \
    PASTEMAC(ch,copys)( *alpha, alpha_local ); \
    if ( bli_is_conj( conjh ) ) \
    { \
        PASTEMAC(ch,seti0s)( alpha_local ); \
    } \
\
    /* The algorithm will be expressed in terms of the lower triangular case; \
       the upper triangular case is supported by swapping the row and column \
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx, but only if we are being invoked \
           as her; for syr, conjx is unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the Hermitian \
       transpose, if applicable) to conjx as needed to arrive at the effective \
       conjugation for the scalar and vector subproblems. */ \
    conj0 = conjx; \
    conj1 = bli_apply_conj( conjh, conjx ); \
\
    /* Eliminate unused variable warnings. This is done only after conj0 has \
       been assigned so that no uninitialized object is ever read. */ \
    ( void )conj0; \
\
    PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_behind = i; \
        x0       = x + (0  )*incx; \
        chi1     = x + (i  )*incx; \
        c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx to chi1. */ \
        PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
        PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
\
        /* Compute scalar for vector subproblem. */ \
        PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
\
        /* Compute alpha * chi1 * conj(chi1) after chi1 has already been \
           conjugated, if needed, by conjx. */ \
        PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
\
        /* c10t = c10t + alpha * chi1 * x0'; */ \
        kfp_av \
        ( \
          conj1, \
          n_behind, \
          &alpha_chi1, \
          x0,   incx, \
          c10t, cs_ct, \
          cntx  \
        ); \
\
        /* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
        PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
\
        /* For her, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}

INSERT_GENTFUNC_BASIC0( her_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/her/bli_her_unb_var2.c 0000664 0000000 0000000 00000011355 14634250137 0023213 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Generator for the typed unblocked variant 2 of her. The same code also
// performs syr when conjh is not a conjugation, which is why alpha is passed
// as a full ctype rather than as a real value.
//
// The algorithm is written for the lower triangular case. Iteration i
// updates the m-i-1 elements below the diagonal in column i (c21) with an
// axpyv kernel call, and then updates the diagonal element gamma11. The
// upper triangular case is handled by swapping the strides of C and
// adjusting the conjugation flags.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, /* complex alpha allows her variants to also perform syr. */ \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  chi1; \
    ctype*  x2; \
    ctype*  gamma11; \
    ctype*  c21; \
    ctype   alpha_local; \
    ctype   alpha_chi1; \
    ctype   alpha_chi1_chi1; \
    ctype   conjx0_chi1; \
    ctype   conjx1_chi1; \
    dim_t   i; \
    dim_t   n_ahead; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
\
    /* Make a local copy of alpha and zero out the imaginary component if \
       we are being invoked as her, since her requires alpha to be real. */ \
    PASTEMAC(ch,copys)( *alpha, alpha_local ); \
    if ( bli_is_conj( conjh ) ) \
    { \
        PASTEMAC(ch,seti0s)( alpha_local ); \
    } \
\
    /* The algorithm will be expressed in terms of the lower triangular case; \
       the upper triangular case is supported by swapping the row and column \
       strides of A and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx, but only if we are being invoked \
           as her; for syr, conjx is unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the Hermitian \
       transpose, if applicable) to conjx as needed to arrive at the effective \
       conjugation for the scalar and vector subproblems. */ \
    conj0 = bli_apply_conj( conjh, conjx ); \
    conj1 = conjx; \
\
    /* Eliminate unused variable warnings. This is done only after conj0 has \
       been assigned so that no uninitialized object is ever read. */ \
    ( void )conj0; \
\
    PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_ahead  = m - i - 1; \
        chi1     = x + (i  )*incx; \
        x2       = x + (i+1)*incx; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
        c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx to chi1. */ \
        PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
        PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
\
        /* Compute scalar for vector subproblem. */ \
        PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
\
        /* Compute alpha * chi1 * conj(chi1) after chi1 has already been \
           conjugated, if needed, by conjx. */ \
        PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
\
        /* c21 = c21 + alpha * x2 * conj(chi1); */ \
        kfp_av \
        ( \
          conj1, \
          n_ahead, \
          &alpha_chi1, \
          x2,  incx, \
          c21, rs_ct, \
          cntx  \
        ); \
\
        /* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
        PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
\
        /* For her, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}

INSERT_GENTFUNC_BASIC0( her_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/her/bli_her_var.h 0000664 0000000 0000000 00000005025 14634250137 0022267 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Prototype generator for the object-based her variants. Every variant takes
// conjh, the scalar alpha, the vector x, the matrix c, a context, and a
// control-tree pointer.
// NOTE(review): the definition of bli_her_blk_var1() in
// other/bli_her_blk_var1.c takes "her_t* cntl" rather than "cntl_t* cntl";
// confirm that the files under other/ are not compiled against this header.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
conj_t conjh, \
obj_t* alpha, \
obj_t* x, \
obj_t* c, \
cntx_t* cntx, \
cntl_t* cntl \
);
GENPROT( her_blk_var1 )
GENPROT( her_blk_var2 )
GENPROT( her_unb_var1 )
GENPROT( her_unb_var2 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// Prototype generator for the typed unblocked her variants. The generated
// signature matches the definitions in bli_her_unb_var1.c and
// bli_her_unb_var2.c. The ctype_r and chr macro parameters are accepted but
// do not appear in the generated prototype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conjx, \
conj_t conjh, \
dim_t m, \
ctype* alpha, /* complex alpha allows her variants to also perform syr. */ \
ctype* x, inc_t incx, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROTR_BASIC0( her_unb_var1 )
INSERT_GENTPROTR_BASIC0( her_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/her/bli_her_var_oapi.c 0000664 0000000 0000000 00000005452 14634250137 0023276 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Generator for the object-based her variant wrappers. Each generated
// function reads the datatype, uplo, dimension, buffers and strides out of
// its object arguments, queries the typed implementation of the variant for
// that datatype, and calls it. The cntl argument is accepted but not used.
//
// The macro definition ends at the closing brace. That line must not carry a
// trailing backslash: a continuation there would splice whatever follows
// (here, the first GENFRONT instantiation) into the macro body.
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       conj_t  conjh, \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  c, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
    bli_init_once(); \
\
    num_t     dt        = bli_obj_dt( c ); \
\
    uplo_t    uplo      = bli_obj_uplo( c ); \
    conj_t    conjx     = bli_obj_conj_status( x ); \
\
    dim_t     m         = bli_obj_length( c ); \
\
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     incx      = bli_obj_vector_inc( x ); \
\
    void*     buf_c     = bli_obj_buffer_at_off( c ); \
    inc_t     rs_c      = bli_obj_row_stride( c ); \
    inc_t     cs_c      = bli_obj_col_stride( c ); \
\
    /* The scalar buffer is fetched for the datatype of c. */ \
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
    /* Query a type-specific function pointer, except one that uses \
       void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,_unb,_vft) f = \
    PASTEMAC(varname,_qfp)( dt ); \
\
    f \
    ( \
      uplo, \
      conjx, \
      conjh, \
      m, \
      buf_alpha, \
      buf_x, incx, \
      buf_c, rs_c, cs_c, \
      cntx  \
    ); \
}

GENFRONT( her, her_unb_var1 )
GENFRONT( her, her_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/her/other/ 0000775 0000000 0000000 00000000000 14634250137 0020761 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_blk_var1.c 0000664 0000000 0000000 00000011245 14634250137 0024315 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * bli_her_blk_var1()
 *
 * Blocked variant 1 of the rank-1 update on c. The diagonal of c is walked
 * from top-left to bottom-right; at each step the block row to the left of
 * the diagonal block (C10) is updated via bli_ger_int() and the diagonal
 * block itself (C11) via bli_her_int(), using the sub-trees stored in cntl.
 *
 *   conjh  - forwarded unchanged to bli_ger_int() and bli_her_int().
 *   alpha  - scalar operand, forwarded unchanged.
 *   x      - vector operand, partitioned front-to-back.
 *   c      - matrix operand, partitioned along its diagonal.
 *   cntx   - context, forwarded to every callee.
 *   cntl   - control tree node supplying the blocksize id and sub-trees.
 */
void bli_her_blk_var1( conj_t  conjh,
                       obj_t*  alpha,
                       obj_t*  x,
                       obj_t*  c,
                       cntx_t* cntx,
                       her_t*  cntl )
{
	obj_t c_diag, c_diag_pk;   /* C11 and its packed counterpart */
	obj_t c_left;              /* C10 */
	obj_t x_cur, x_cur_pk;     /* x1 and its packed counterpart */
	obj_t x_prev;              /* x0 */
	dim_t dim;
	dim_t offset;

	/* The algorithm is written for the lower triangular case only. Upper
	   triangular storage is still handled: when bli_acquire_mpart_tl2br()
	   is given a matrix stored in the upper triangle and asked for a
	   subpartition that lies in the lower triangle, it returns the
	   corresponding "mirror" subpartition from the upper triangle, marked
	   for transposition (and conjugation). */

	/* Prepare the pack objects before first use. */
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_cur_pk );

	/* The iteration space is the length of c. */
	dim = bli_obj_length( c );

	/* Walk down the diagonal one block at a time. */
	offset = 0;
	while ( offset < dim )
	{
		/* Blocksize to use for this iteration. */
		const dim_t blk = bli_determine_blocksize_f( offset, dim, c,
		                                             bli_cntl_bszid( cntl ),
		                                             cntx );

		/* Carve out C11, C10, x1 and x0 for the current position. */
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, offset, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART10, offset, blk, c, &c_left );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, offset, blk, x, &x_cur );
		bli_acquire_vpart_f2b( BLIS_SUBPART0, offset, blk, x, &x_prev );

		/* Set up the pack objects for C11 and x1 (if packing is requested). */
		bli_packm_init( &c_diag, &c_diag_pk,
		                cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_cur, &x_cur_pk,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		/* Perform the copy/pack of C11 and x1 (if packing is requested). */
		bli_packm_int( &c_diag, &c_diag_pk,
		               cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_cur, &x_cur_pk,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		/* C10 = C10 + alpha * x1 * x0'; */
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_cur_pk, &x_prev, &c_left,
		             cntx, bli_cntl_sub_ger( cntl ) );

		/* C11 = C11 + alpha * x1 * x1'; */
		bli_her_int( conjh, alpha,
		             &x_cur_pk, &c_diag_pk,
		             cntx, bli_cntl_sub_her( cntl ) );

		/* Copy/unpack C11 back (if it was packed). */
		bli_unpackm_int( &c_diag_pk, &c_diag,
		                 cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		offset += blk;
	}

	/* Hand any packing buffers acquired along the way back to the memory
	   manager. */
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_cur_pk, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_blk_var2.c 0000664 0000000 0000000 00000011245 14634250137 0024316 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * bli_her_blk_var2()
 *
 * Blocked variant 2 of the rank-1 update on c. The diagonal of c is walked
 * from top-left to bottom-right; at each step the block column below the
 * diagonal block (C21) is updated via bli_ger_int() and the diagonal block
 * itself (C11) via bli_her_int(), using the sub-trees stored in cntl.
 *
 *   conjh  - forwarded unchanged to bli_ger_int() and bli_her_int().
 *   alpha  - scalar operand, forwarded unchanged.
 *   x      - vector operand, partitioned front-to-back.
 *   c      - matrix operand, partitioned along its diagonal.
 *   cntx   - context, forwarded to every callee.
 *   cntl   - control tree node supplying the blocksize id and sub-trees.
 */
void bli_her_blk_var2( conj_t  conjh,
                       obj_t*  alpha,
                       obj_t*  x,
                       obj_t*  c,
                       cntx_t* cntx,
                       her_t*  cntl )
{
	obj_t c_diag, c_diag_pk;   /* C11 and its packed counterpart */
	obj_t c_below;             /* C21 */
	obj_t x_cur, x_cur_pk;     /* x1 and its packed counterpart */
	obj_t x_next;              /* x2 */
	dim_t dim;
	dim_t offset;

	/* The algorithm is written for the lower triangular case only. Upper
	   triangular storage is still handled: when bli_acquire_mpart_tl2br()
	   is given a matrix stored in the upper triangle and asked for a
	   subpartition that lies in the lower triangle, it returns the
	   corresponding "mirror" subpartition from the upper triangle, marked
	   for transposition (and conjugation). */

	/* Prepare the pack objects before first use. */
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_cur_pk );

	/* The iteration space is the length of c. */
	dim = bli_obj_length( c );

	/* Walk down the diagonal one block at a time. */
	offset = 0;
	while ( offset < dim )
	{
		/* Blocksize to use for this iteration. */
		const dim_t blk = bli_determine_blocksize_f( offset, dim, c,
		                                             bli_cntl_bszid( cntl ),
		                                             cntx );

		/* Carve out C11, C21, x1 and x2 for the current position. */
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, offset, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART21, offset, blk, c, &c_below );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, offset, blk, x, &x_cur );
		bli_acquire_vpart_f2b( BLIS_SUBPART2, offset, blk, x, &x_next );

		/* Set up the pack objects for C11 and x1 (if packing is requested). */
		bli_packm_init( &c_diag, &c_diag_pk,
		                cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_cur, &x_cur_pk,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		/* Perform the copy/pack of C11 and x1 (if packing is requested). */
		bli_packm_int( &c_diag, &c_diag_pk,
		               cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_cur, &x_cur_pk,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		/* C21 = C21 + alpha * x2 * x1'; */
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_next, &x_cur_pk, &c_below,
		             cntx, bli_cntl_sub_ger( cntl ) );

		/* C11 = C11 + alpha * x1 * x1'; */
		bli_her_int( conjh, alpha,
		             &x_cur_pk, &c_diag_pk,
		             cntx, bli_cntl_sub_her( cntl ) );

		/* Copy/unpack C11 back (if it was packed). */
		bli_unpackm_int( &c_diag_pk, &c_diag,
		                 cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		offset += blk;
	}

	/* Hand any packing buffers acquired along the way back to the memory
	   manager. */
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_cur_pk, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_cntl.c 0000664 0000000 0000000 00000012737 14634250137 0023563 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees defined in other translation units. Of these, only
// packv_cntl, ger_cntl_rp_bs_row and ger_cntl_cp_bs_col are referenced by
// the code in this file.
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackm_t* unpackm_cntl;
extern ger_t* ger_cntl_rp_bs_row;
extern ger_t* ger_cntl_cp_bs_col;
extern ger_t* ger_cntl_bs_ke_row;
extern ger_t* ger_cntl_bs_ke_col;
// Control trees for her, owned by this file. They start out NULL, are
// populated by bli_her_cntl_init() and freed by bli_her_cntl_finalize().
// "bs_ke" trees are the unblocked leaves; "ge" trees are the blocked trees
// that reference those leaves as their her sub-tree.
her_t* her_cntl_bs_ke_lrow_ucol = NULL;
her_t* her_cntl_bs_ke_lcol_urow = NULL;
her_t* her_cntl_ge_lrow_ucol = NULL;
her_t* her_cntl_ge_lcol_urow = NULL;
/*
 * bli_her_cntl_init()
 *
 * Allocates the four her control trees held in this file's global pointers.
 * The two unblocked leaf trees are built first because the two blocked
 * trees store them as their her sub-tree.
 */
void bli_her_cntl_init()
{
	/* Leaf trees for the lowest-level kernels. These induce operations on
	   (presumably) relatively small block-subvector problems: no blocksize,
	   no packing, no sub-trees. */
	her_cntl_bs_ke_lrow_ucol = bli_her_cntl_obj_create( BLIS_UNBLOCKED,
	                                                    BLIS_VARIANT1,
	                                                    0,
	                                                    NULL, NULL, NULL,
	                                                    NULL, NULL );

	her_cntl_bs_ke_lcol_urow = bli_her_cntl_obj_create( BLIS_UNBLOCKED,
	                                                    BLIS_VARIANT2,
	                                                    0,
	                                                    NULL, NULL, NULL,
	                                                    NULL, NULL );

	/* Trees for generally large problems. The variants chosen here partition
	   for ger subproblems in the same direction as the assumed storage. */
	her_cntl_ge_lrow_ucol = bli_her_cntl_obj_create( BLIS_BLOCKED,
	                                                 BLIS_VARIANT1,
	                                                 BLIS_M2,
	                                                 packv_cntl,               /* pack x1 (if needed) */
	                                                 NULL,                     /* do NOT pack C11 */
	                                                 ger_cntl_rp_bs_row,
	                                                 her_cntl_bs_ke_lrow_ucol,
	                                                 NULL );                   /* no unpacking needed */

	her_cntl_ge_lcol_urow = bli_her_cntl_obj_create( BLIS_BLOCKED,
	                                                 BLIS_VARIANT2,
	                                                 BLIS_M2,
	                                                 packv_cntl,               /* pack x1 (if needed) */
	                                                 NULL,                     /* do NOT pack C11 */
	                                                 ger_cntl_cp_bs_col,
	                                                 her_cntl_bs_ke_lcol_urow,
	                                                 NULL );                   /* no unpacking needed */
}
// Releases the four her control tree nodes created by bli_her_cntl_init()
// by passing each global pointer to bli_cntl_free_node(). The globals are
// not reassigned by this function itself.
void bli_her_cntl_finalize()
{
bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( her_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( her_cntl_ge_lrow_ucol );
bli_cntl_free_node( her_cntl_ge_lcol_urow );
}
/*
 * bli_her_cntl_obj_create()
 *
 * Allocates a her control tree node with bli_malloc_intl() and stores every
 * argument in the field of the same name. The pointer returned by
 * bli_malloc_intl() is used without a NULL check here.
 *
 * Returns the newly allocated node.
 */
her_t* bli_her_cntl_obj_create( impl_t     impl_type,
                                varnum_t   var_num,
                                bszid_t    bszid,
                                packv_t*   sub_packv_x1,
                                packm_t*   sub_packm_c11,
                                ger_t*     sub_ger,
                                her_t*     sub_her,
                                unpackm_t* sub_unpackm_c11 )
{
	her_t* node = ( her_t* ) bli_malloc_intl( sizeof( *node ) );

	/* Dispatch information. */
	node->impl_type = impl_type;
	node->var_num   = var_num;
	node->bszid     = bszid;

	/* Sub-trees. */
	node->sub_packv_x1    = sub_packv_x1;
	node->sub_packm_c11   = sub_packm_c11;
	node->sub_ger         = sub_ger;
	node->sub_her         = sub_her;
	node->sub_unpackm_c11 = sub_unpackm_c11;

	return node;
}
/*
 * bli_her_cntl_obj_init()
 *
 * Fills in an existing her control tree node: every argument after cntl is
 * stored in the field of the same name. No allocation is performed and cntl
 * is dereferenced without a NULL check.
 */
void bli_her_cntl_obj_init( her_t*     cntl,
                            impl_t     impl_type,
                            varnum_t   var_num,
                            bszid_t    bszid,
                            packv_t*   sub_packv_x1,
                            packm_t*   sub_packm_c11,
                            ger_t*     sub_ger,
                            her_t*     sub_her,
                            unpackm_t* sub_unpackm_c11 )
{
	/* Sub-trees. */
	cntl->sub_unpackm_c11 = sub_unpackm_c11;
	cntl->sub_her         = sub_her;
	cntl->sub_ger         = sub_ger;
	cntl->sub_packm_c11   = sub_packm_c11;
	cntl->sub_packv_x1    = sub_packv_x1;

	/* Dispatch information. */
	cntl->bszid     = bszid;
	cntl->var_num   = var_num;
	cntl->impl_type = impl_type;
}
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_cntl.h 0000664 0000000 0000000 00000005652 14634250137 0023566 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control tree node for the her operation. Nodes are filled in by
// bli_her_cntl_obj_create() and bli_her_cntl_obj_init().
struct her_s
{
// Implementation type and variant number of this node (e.g. BLIS_BLOCKED /
// BLIS_VARIANT1 in bli_her_cntl_init()).
impl_t impl_type;
varnum_t var_num;
// Blocksize id; bli_her_cntl_init() passes 0 for unblocked nodes and
// BLIS_M2 for blocked nodes.
bszid_t bszid;
// Sub-trees for packing x1 and C11, for the ger and her subproblems, and
// for unpacking C11. Any of them may be NULL.
struct packv_s* sub_packv_x1;
struct packm_s* sub_packm_c11;
struct ger_s* sub_ger;
struct her_s* sub_her;
struct unpackm_s* sub_unpackm_c11;
};
typedef struct her_s her_t;
// Accessor for the her sub-tree of a control tree node. The argument and the
// whole expansion are parenthesized so that any pointer expression (e.g.
// `nodes + 1`) can be passed safely; the result is still an lvalue.
#define bli_cntl_sub_her( cntl ) ( (cntl)->sub_her )
// Create / free the her control trees held in global pointers.
void bli_her_cntl_init( void );
void bli_her_cntl_finalize( void );
// Allocate a new her control tree node and store each argument in the field
// of the same name. Returns the new node.
her_t* bli_her_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x1,
packm_t* sub_packm_c11,
ger_t* sub_ger,
her_t* sub_her,
unpackm_t* sub_unpackm_c11 );
// Same as above, but fills in a caller-provided node instead of allocating.
void bli_her_cntl_obj_init( her_t* cntl,
impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x1,
packm_t* sub_packm_c11,
ger_t* sub_ger,
her_t* sub_her,
unpackm_t* sub_unpackm_c11 );
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_front.c 0000664 0000000 0000000 00000013373 14634250137 0023750 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
extern her_t* her_cntl_bs_ke_lrow_ucol;
extern her_t* her_cntl_bs_ke_lcol_urow;
extern her_t* her_cntl_ge_lrow_ucol;
extern her_t* her_cntl_ge_lcol_urow;

/*
 * bli_her_front()
 *
 * Object-based front-end for the Hermitian rank-1 update of c by x scaled by
 * alpha. It optionally checks the operands, makes a detached copy of alpha
 * cast to the target datatype of x, selects one of the four her control
 * trees based on the strides, uplo and storage of c, and calls bli_her_int()
 * with conjh set to BLIS_CONJUGATE.
 *
 *   alpha - scalar operand; only a local copy-cast is passed on.
 *   x     - vector operand.
 *   c     - matrix operand that receives the update.
 *   cntx  - context, forwarded to bli_her_int().
 *
 * The caller's x and c objects are never modified: pack-schema markings are
 * applied to local aliases, so they do not leak out of this call.
 */
void bli_her_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  c,
       cntx_t* cntx
     )
{
	her_t* her_cntl;
	num_t  dt_targ_x;
	//num_t  dt_targ_c;
	bool   x_has_unit_inc;
	bool   c_has_unit_inc;
	obj_t  alpha_local;
	obj_t  x_local;
	obj_t  c_local;
	num_t  dt_alpha;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_her_check( alpha, x, c );

	// Query the target datatypes of each object.
	dt_targ_x = bli_obj_target_dt( x );
	//dt_targ_c = bli_obj_target_dt( c );

	// Determine whether each operand has unit stride.
	x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 );
	c_has_unit_inc = ( bli_obj_is_row_stored( c ) ||
	                   bli_obj_is_col_stored( c ) );

	// Create object to hold a copy-cast of alpha.
	dt_alpha = dt_targ_x;
	bli_obj_scalar_init_detached_copy_of( dt_alpha,
	                                      BLIS_NO_CONJUGATE,
	                                      alpha,
	                                      &alpha_local );

	// Alias the operands so that any pack-schema marking below is applied to
	// local copies of the object metadata rather than to the caller's objects.
	bli_obj_alias_to( x, &x_local );
	bli_obj_alias_to( c, &c_local );

	// If all operands have unit stride, we choose a control tree for calling
	// the unblocked implementation directly without any blocking.
	if ( x_has_unit_inc &&
	     c_has_unit_inc )
	{
		// We use two control trees to handle the four cases corresponding to
		// combinations of upper/lower triangular storage and row/column-storage.
		// The row-stored lower triangular and column-stored upper triangular
		// trees are identical. Same for the remaining two trees.
		if ( bli_obj_is_lower( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
			else                              her_cntl = her_cntl_bs_ke_lcol_urow;
		}
		else // if ( bli_obj_is_upper( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
			else                              her_cntl = her_cntl_bs_ke_lrow_ucol;
		}
	}
	else
	{
		// Mark (the local aliases of) objects with unit stride as already
		// being packed. This prevents unnecessary packing from happening
		// within the blocked algorithm.
		if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, &x_local );
		if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, &c_local );

		// Here, we make a similar choice as above, except that (1) we look
		// at storage tilt, and (2) we choose a tree that performs blocking.
		if ( bli_obj_is_lower( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lrow_ucol;
			else                              her_cntl = her_cntl_ge_lcol_urow;
		}
		else // if ( bli_obj_is_upper( c ) )
		{
			if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lcol_urow;
			else                              her_cntl = her_cntl_ge_lrow_ucol;
		}
	}

	// Invoke the internal back-end with the copy-cast scalar, the aliased
	// operands and the chosen control tree. Set conjh to BLIS_CONJUGATE to
	// invoke the Hermitian (and not symmetric) algorithms.
	bli_her_int( BLIS_CONJUGATE,
	             &alpha_local,
	             &x_local,
	             &c_local,
	             cntx,
	             her_cntl );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
// For each datatype, the generated function wraps the raw buffers in local
// obj_t objects and forwards them to the object-based function named by
// opname (her_front in the instantiation below):
//   - alpha is attached as a 1x1 object of the real datatype dt_r;
//   - x is attached as an m x 1 object with row stride incx and column
//     stride m * incx, and carries conjx;
//   - c is attached as an m x m object with strides rs_c / cs_c, and is
//     marked with uploc and BLIS_HERMITIAN structure.
//
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
conj_t conjx, \
dim_t m, \
ctype_r* alpha, \
ctype* x, inc_t incx, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
) \
{ \
const num_t dt_r = PASTEMAC(chr,type); \
const num_t dt = PASTEMAC(ch,type); \
\
obj_t alphao, xo, co; \
\
inc_t rs_x, cs_x; \
\
rs_x = incx; cs_x = m * incx; \
\
bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \
\
bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
\
bli_obj_set_conj( conjx, &xo ); \
bli_obj_set_uplo( uploc, &co ); \
\
bli_obj_set_struc( BLIS_HERMITIAN, &co ); \
\
PASTEMAC0(opname)( &alphao, \
&xo, \
&co, \
cntx ); \
}
INSERT_GENTFUNCR_BASIC0( her_front )
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_front.h 0000664 0000000 0000000 00000004135 14634250137 0023751 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based front-end for her: takes the scalar alpha, the vector x, the
// matrix c and a context.
void bli_her_front
(
obj_t* alpha,
obj_t* x,
obj_t* c,
cntx_t* cntx
);
// Prototype template for the typed (raw-buffer) interfaces. alpha is passed
// as a pointer to the real type ctype_r; x and c are passed as pointers to
// ctype together with their increment / strides.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
conj_t conjx, \
dim_t m, \
ctype_r* alpha, \
ctype* x, inc_t incx, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROTR_BASIC( her_front )
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_int.c 0000664 0000000 0000000 00000007160 14634250137 0023407 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type (her_fp) shared by every her variant that
// bli_her_int() can dispatch to.
#define FUNCPTR_T her_fp
typedef void (*FUNCPTR_T)( conj_t conjh,
obj_t* alpha,
obj_t* x,
obj_t* c,
cntx_t* cntx,
her_t* cntl );
// Dispatch table, indexed by bli_her_int() as
// vars[variant number][implementation type]. Only the unblocked and blocked
// columns of the first two rows are populated; every other entry is NULL.
// NOTE(review): bli_her_int() calls the selected entry without a NULL or
// bounds check, so control trees must only name populated entries.
static FUNCPTR_T vars[4][3] =
{
// unblocked unblocked with fusing blocked
{ bli_her_unb_var1, NULL, bli_her_blk_var1, },
{ bli_her_unb_var2, NULL, bli_her_blk_var2, },
{ NULL, NULL, NULL, },
{ NULL, NULL, NULL, },
};
/*
 * bli_her_int()
 *
 * Internal back-end shared by her and syr. It optionally checks the
 * operands, returns early on zero dimensions, resolves a conjugation marked
 * on c, and dispatches to the variant selected by the control tree.
 *
 *   conjh - when it denotes conjugation, the her check is used; otherwise
 *           the syr check is used. Forwarded to the variant.
 *   alpha - scalar operand, forwarded unchanged.
 *   x, c  - operands; only local aliases are modified.
 *   cntx  - context, forwarded to the variant.
 *   cntl  - control tree node providing variant number and implementation
 *           type, which index the vars table.
 */
void bli_her_int( conj_t  conjh,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  c,
                  cntx_t* cntx,
                  her_t*  cntl )
{
	obj_t    x_alias;
	obj_t    c_alias;
	varnum_t variant;
	impl_t   impl;

	/* Parameter checking, using the check that matches conjh. */
	if ( bli_error_checking_is_enabled() )
	{
		if ( bli_is_conj( conjh ) ) bli_her_check( alpha, x, c );
		else                        bli_syr_check( alpha, x, c );
	}

	/* Nothing to do when either C or x has a zero dimension. */
	if ( bli_obj_has_zero_dim( c ) || bli_obj_has_zero_dim( x ) ) return;

	/* Work on aliases so that conjugation toggles stay local. */
	bli_obj_alias_to( x, &x_alias );
	bli_obj_alias_to( c, &c_alias );

	/* A conjugation marked on C is interpreted as a request to conjugate the
	   other operands instead. alpha needs no conjugation because it is
	   guaranteed to be real. */
	if ( bli_obj_has_conj( &c_alias ) )
	{
		bli_obj_toggle_conj( &c_alias );
		bli_obj_toggle_conj( &x_alias );
	}

	/* Read the variant number and implementation type from the tree. */
	variant = bli_cntl_var_num( cntl );
	impl    = bli_cntl_impl_type( cntl );

	/* Look up the variant in the dispatch table and invoke it. */
	vars[variant][impl]( conjh,
	                     alpha,
	                     &x_alias,
	                     &c_alias,
	                     cntx,
	                     cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/her/other/bli_her_int.h 0000664 0000000 0000000 00000003510 14634250137 0023407 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal back-end for her/syr: dispatches to the variant selected by cntl.
// conjh selects between the her and syr parameter checks and is forwarded
// to the variant.
void bli_her_int( conj_t conjh,
obj_t* alpha,
obj_t* x,
obj_t* c,
cntx_t* cntx,
her_t* cntl );
cython-blis-1.0.0/blis/_src/frame/2/her2/ 0000775 0000000 0000000 00000000000 14634250137 0017722 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2.h 0000664 0000000 0000000 00000003467 14634250137 0021573 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_her2_cntl.h"
//#include "bli_her2_front.h"
//#include "bli_her2_int.h"
#include "bli_her2_var.h"
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unb_var1.c 0000664 0000000 0000000 00000012451 14634250137 0023354 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unb_var1 -- typed, unblocked rank-2 update, variant 1.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - issues two axpyv kernel calls of length i that accumulate into the
 *     i elements of c reached from c + i*rs_ct with stride cs_ct (c10t):
 *     the first reads y scaled by alpha0 * chi1, the second reads x scaled
 *     by alpha1 * psi1;
 *   - adds alpha0 * chi1 * psi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Start from the strides of the lower-stored reference case. */ \
    inc_t rs_ct = rs_c; \
    inc_t cs_ct = cs_c; \
    ctype alpha0; \
    ctype alpha1; \
\
    if ( !bli_is_lower( uplo ) ) \
    { \
        /* Not lower-stored: walk c through exchanged strides. */ \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
    else \
    { \
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
\
    /* Combine conjh with conjy/conjx to obtain the conjugation handed to \
       the two vector subproblems. */ \
    conj_t conj0 = bli_apply_conj( conjh, conjy ); \
    conj_t conj1 = bli_apply_conj( conjh, conjx ); \
\
    /* Fetch the axpyv kernel for this datatype from the context. */ \
    PASTECH(ch,axpyv_ker_ft) axpyv_kfp = \
        bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    /* Both kernel calls always read from the first element of x and y. */ \
    ctype* x0 = x; \
    ctype* y0 = y; \
\
    for ( dim_t idx = 0; idx < m; ++idx ) \
    { \
        ctype* chi1    = x + idx*incx; \
        ctype* psi1    = y + idx*incy; \
        ctype* c10t    = c + idx*rs_ct; \
        ctype* gamma11 = c10t + idx*cs_ct; \
\
        ctype chi1_cx; \
        ctype psi1_cy; \
        ctype psi1_c0; \
        ctype a0_chi1; \
        ctype a1_psi1; \
        ctype a0_chi1_psi1; \
\
        /* alpha0 * conjx(chi1): scalar for the update that reads y0. */ \
        PASTEMAC(ch,copycjs)( conjx, *chi1, chi1_cx ); \
        PASTEMAC(ch,scal2s)( alpha0, chi1_cx, a0_chi1 ); \
\
        /* alpha1 * conjy(psi1): scalar for the update that reads x0. */ \
        PASTEMAC(ch,copycjs)( conjy, *psi1, psi1_cy ); \
        PASTEMAC(ch,scal2s)( alpha1, psi1_cy, a1_psi1 ); \
\
        /* alpha0 * conjx(chi1) * conj0(psi1): the diagonal contribution. */ \
        PASTEMAC(ch,copycjs)( conj0, *psi1, psi1_c0 ); \
        PASTEMAC(ch,scal2s)( a0_chi1, psi1_c0, a0_chi1_psi1 ); \
\
        /* c10t = c10t + alpha * chi1 * y0'; */ \
        axpyv_kfp \
        ( \
          conj0, \
          idx, \
          &a0_chi1, \
          y0, incy, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
        axpyv_kfp \
        ( \
          conj1, \
          idx, \
          &a1_psi1, \
          x0, incx, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( a0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( a0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unb_var2.c 0000664 0000000 0000000 00000013027 14634250137 0023355 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unb_var2 -- typed, unblocked rank-2 update, variant 2.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - calls the axpyv kernel on the m-i-1 elements starting at
 *     c + (i+1)*rs_ct + i*cs_ct with stride rs_ct (c21), reading x2 and
 *     scaling by alpha0 * psi1;
 *   - calls the axpyv kernel on the i elements starting at c + i*rs_ct with
 *     stride cs_ct (c10t), reading x0 and scaling by alpha1 * psi1;
 *   - adds alpha0 * psi1 * chi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  x0; \
    ctype*  chi1; \
    ctype*  x2; \
    ctype*  psi1; \
    ctype*  c10t; \
    ctype*  gamma11; \
    ctype*  c21; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_psi1; \
    ctype   alpha1_psi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjy0_psi1; \
    ctype   conjy1_psi1; \
    ctype   conjx0_chi1; \
    dim_t   i; \
    dim_t   n_behind; \
    dim_t   n_ahead; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
    conj_t  conjh_conjy; \
\
    /* The algorithm will be expressed in terms of the lower triangular \
       case; the upper triangular case is supported by swapping the row and \
       column strides of c and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the \
       Hermitian transpose, if applicable) to conjx and/or conjy as needed \
       to arrive at the effective conjugation for the vector subproblems. */ \
    conj0       = conjx; \
    conj1       = bli_apply_conj( conjh, conjx ); \
    conjh_conjy = bli_apply_conj( conjh, conjy ); \
\
    /* Eliminate unused variable warnings.  The cast is placed after the \
       assignment so that it never reads an uninitialized object. */ \
    ( void )conjh_conjy; \
\
    PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_behind = i; \
        n_ahead  = m - i - 1; \
        x0       = x + (0  )*incx; \
        chi1     = x + (i  )*incx; \
        x2       = x + (i+1)*incx; \
        psi1     = y + (i  )*incy; \
        c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
        c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
        PASTEMAC(ch,copycjs)( conjy,       *psi1, conjy1_psi1 ); \
        PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have \
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
\
        /* c21 = c21 + alpha * x2 * conj(psi1); */ \
        kfp_av \
        ( \
          conj0, \
          n_ahead, \
          &alpha0_psi1, \
          x2,  incx, \
          c21, rs_ct, \
          cntx \
        ); \
\
        /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
        kfp_av \
        ( \
          conj1, \
          n_behind, \
          &alpha1_psi1, \
          x0,   incx, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unb_var3.c 0000664 0000000 0000000 00000013025 14634250137 0023354 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unb_var3 -- typed, unblocked rank-2 update, variant 3.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - calls the axpyv kernel on the i elements starting at c + i*rs_ct with
 *     stride cs_ct (c10t), reading y0 and scaling by alpha0 * chi1;
 *   - calls the axpyv kernel on the m-i-1 elements starting at
 *     c + (i+1)*rs_ct + i*cs_ct with stride rs_ct (c21), reading y2 and
 *     scaling by alpha1 * chi1;
 *   - adds alpha0 * chi1 * psi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  chi1; \
    ctype*  y0; \
    ctype*  psi1; \
    ctype*  y2; \
    ctype*  c10t; \
    ctype*  gamma11; \
    ctype*  c21; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_chi1; \
    ctype   alpha1_chi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjx0_chi1; \
    ctype   conjx1_chi1; \
    ctype   conjy0_psi1; \
    dim_t   i; \
    dim_t   n_behind; \
    dim_t   n_ahead; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
    conj_t  conjh_conjx; \
\
    /* The algorithm will be expressed in terms of the lower triangular \
       case; the upper triangular case is supported by swapping the row and \
       column strides of c and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the \
       Hermitian transpose, if applicable) to conjx and/or conjy as needed \
       to arrive at the effective conjugation for the vector subproblems. */ \
    conj0       = bli_apply_conj( conjh, conjy ); \
    conj1       = conjy; \
    conjh_conjx = bli_apply_conj( conjh, conjx ); \
\
    /* Eliminate unused variable warnings.  The cast is placed after the \
       assignment so that it never reads an uninitialized object. */ \
    ( void )conjh_conjx; \
\
    PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_behind = i; \
        n_ahead  = m - i - 1; \
        chi1     = x + (i  )*incx; \
        y0       = y + (0  )*incy; \
        psi1     = y + (i  )*incy; \
        y2       = y + (i+1)*incy; \
        c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
        c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjx,       *chi1, conjx0_chi1 ); \
        PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
        PASTEMAC(ch,copycjs)( conj0,       *psi1, conjy0_psi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have \
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \
\
        /* c10t = c10t + alpha * chi1 * y0'; */ \
        kfp_av \
        ( \
          conj0, \
          n_behind, \
          &alpha0_chi1, \
          y0,   incy, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
        kfp_av \
        ( \
          conj1, \
          n_ahead, \
          &alpha1_chi1, \
          y2,  incy, \
          c21, rs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unb_var3 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unb_var4.c 0000664 0000000 0000000 00000012767 14634250137 0023371 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unb_var4 -- typed, unblocked rank-2 update, variant 4.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - issues two axpyv kernel calls on the m-i-1 elements starting at
 *     c + (i+1)*rs_ct + i*cs_ct with stride rs_ct (c21): the first reads x2
 *     scaled by alpha0 * psi1, the second reads y2 scaled by alpha1 * chi1;
 *   - adds alpha0 * psi1 * chi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  chi1; \
    ctype*  x2; \
    ctype*  psi1; \
    ctype*  y2; \
    ctype*  gamma11; \
    ctype*  c21; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_psi1; \
    ctype   alpha1_chi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjy0_psi1; \
    ctype   conjx1_chi1; \
    ctype   conjx0_chi1; \
    dim_t   i; \
    dim_t   n_ahead; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
    conj_t  conjh_conjx; \
    conj_t  conjh_conjy; \
\
    /* The algorithm will be expressed in terms of the lower triangular \
       case; the upper triangular case is supported by swapping the row and \
       column strides of c and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the \
       Hermitian transpose, if applicable) to conjx and/or conjy as needed \
       to arrive at the effective conjugation for the vector subproblems. */ \
    conj0       = conjx; \
    conj1       = conjy; \
    conjh_conjx = bli_apply_conj( conjh, conjx ); \
    conjh_conjy = bli_apply_conj( conjh, conjy ); \
\
    /* Eliminate unused variable warnings.  The casts are placed after the \
       assignments so that they never read an uninitialized object. */ \
    ( void )conjh_conjx; \
    ( void )conjh_conjy; \
\
    PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_ahead  = m - i - 1; \
        chi1     = x + (i  )*incx; \
        x2       = x + (i+1)*incx; \
        psi1     = y + (i  )*incy; \
        y2       = y + (i+1)*incy; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
        c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
        PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
        PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have \
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
\
        /* c21 = c21 + alpha * x2 * conj(psi1); */ \
        kfp_av \
        ( \
          conj0, \
          n_ahead, \
          &alpha0_psi1, \
          x2,  incx, \
          c21, rs_ct, \
          cntx \
        ); \
\
        /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
        kfp_av \
        ( \
          conj1, \
          n_ahead, \
          &alpha1_chi1, \
          y2,  incy, \
          c21, rs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unb_var4 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unf_var1.c 0000664 0000000 0000000 00000012350 14634250137 0023356 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unf_var1 -- typed, unblocked rank-2 update, variant 1, built on the
 * fused axpy2v kernel.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - makes a single axpy2v kernel call of length i that accumulates into
 *     the i elements of c reached from c + i*rs_ct with stride cs_ct
 *     (c10t), reading y with scalar alpha0 * chi1 and x with scalar
 *     alpha1 * psi1;
 *   - adds alpha0 * chi1 * psi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Start from the strides of the lower-stored reference case. */ \
    inc_t rs_ct = rs_c; \
    inc_t cs_ct = cs_c; \
    ctype alpha0; \
    ctype alpha1; \
\
    if ( !bli_is_lower( uplo ) ) \
    { \
        /* Not lower-stored: walk c through exchanged strides. */ \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
    else \
    { \
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
\
    /* Combine conjh with conjy/conjx to obtain the conjugation handed to \
       the fused vector subproblem. */ \
    conj_t conj0 = bli_apply_conj( conjh, conjy ); \
    conj_t conj1 = bli_apply_conj( conjh, conjx ); \
\
    /* Fetch the axpy2v kernel for this datatype from the context. */ \
    PASTECH(ch,axpy2v_ker_ft) axpy2v_kfp = \
        bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
    /* The kernel always reads from the first element of x and y. */ \
    ctype* x0 = x; \
    ctype* y0 = y; \
\
    for ( dim_t idx = 0; idx < m; ++idx ) \
    { \
        ctype* chi1    = x + idx*incx; \
        ctype* psi1    = y + idx*incy; \
        ctype* c10t    = c + idx*rs_ct; \
        ctype* gamma11 = c10t + idx*cs_ct; \
\
        ctype chi1_cx; \
        ctype psi1_cy; \
        ctype psi1_c0; \
        ctype a0_chi1; \
        ctype a1_psi1; \
        ctype a0_chi1_psi1; \
\
        /* alpha0 * conjx(chi1): scalar paired with y0. */ \
        PASTEMAC(ch,copycjs)( conjx, *chi1, chi1_cx ); \
        PASTEMAC(ch,scal2s)( alpha0, chi1_cx, a0_chi1 ); \
\
        /* alpha1 * conjy(psi1): scalar paired with x0. */ \
        PASTEMAC(ch,copycjs)( conjy, *psi1, psi1_cy ); \
        PASTEMAC(ch,scal2s)( alpha1, psi1_cy, a1_psi1 ); \
\
        /* alpha0 * conjx(chi1) * conj0(psi1): the diagonal contribution. */ \
        PASTEMAC(ch,copycjs)( conj0, *psi1, psi1_c0 ); \
        PASTEMAC(ch,scal2s)( a0_chi1, psi1_c0, a0_chi1_psi1 ); \
\
        /* c10t = c10t +      alpha  * chi1 * y0'; */ \
        /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \
        axpy2v_kfp \
        ( \
          conj0, \
          conj1, \
          idx, \
          &a0_chi1, \
          &a1_psi1, \
          y0, incy, \
          x0, incx, \
          c10t, cs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( a0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( a0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var1 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_unf_var4.c 0000664 0000000 0000000 00000012670 14634250137 0023366 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * her2_unf_var4 -- typed, unblocked rank-2 update, variant 4, built on the
 * fused axpy2v kernel.
 *
 * One function is generated per datatype through INSERT_GENTFUNC_BASIC0.
 * For every index i in [0, m) the generated function
 *   - makes a single axpy2v kernel call on the m-i-1 elements starting at
 *     c + (i+1)*rs_ct + i*cs_ct with stride rs_ct (c21), reading x2 with
 *     scalar alpha0 * psi1 and y2 with scalar alpha1 * chi1;
 *   - adds alpha0 * psi1 * chi1 (each factor conjugated as computed below)
 *     to the diagonal element gamma11 twice;
 *   - clears the imaginary part of gamma11 when conjh requests conjugation.
 *
 * The lower-stored case is the reference.  For any other uplo value the
 * row/column strides of c are exchanged, conjh is applied to conjx/conjy,
 * and the roles of alpha and conjh(alpha) are exchanged.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjy, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    ctype*  chi1; \
    ctype*  x2; \
    ctype*  psi1; \
    ctype*  y2; \
    ctype*  gamma11; \
    ctype*  c21; \
    ctype   alpha0; \
    ctype   alpha1; \
    ctype   alpha0_psi1; \
    ctype   alpha1_chi1; \
    ctype   alpha0_chi1_psi1; \
    ctype   conjy0_psi1; \
    ctype   conjx1_chi1; \
    ctype   conjx0_chi1; \
    dim_t   i; \
    dim_t   n_ahead; \
    inc_t   rs_ct, cs_ct; \
    conj_t  conj0, conj1; \
    conj_t  conjh_conjx; \
    conj_t  conjh_conjy; \
\
    /* The algorithm will be expressed in terms of the lower triangular \
       case; the upper triangular case is supported by swapping the row and \
       column strides of c and toggling some conj parameters. */ \
    if ( bli_is_lower( uplo ) ) \
    { \
        rs_ct = rs_c; \
        cs_ct = cs_c; \
\
        PASTEMAC(ch,copys)( *alpha, alpha0 ); \
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \
    } \
    else /* if ( bli_is_upper( uplo ) ) */ \
    { \
        rs_ct = cs_c; \
        cs_ct = rs_c; \
\
        /* Toggle conjugation of conjx/conjy, but only if we are being \
           invoked as her2; for syr2, conjx/conjy are unchanged. */ \
        conjx = bli_apply_conj( conjh, conjx ); \
        conjy = bli_apply_conj( conjh, conjy ); \
\
        PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \
        PASTEMAC(ch,copys)( *alpha, alpha1 ); \
    } \
\
    /* Apply conjh (which carries the conjugation component of the \
       Hermitian transpose, if applicable) to conjx and/or conjy as needed \
       to arrive at the effective conjugation for the vector subproblems. */ \
    conj0       = conjx; \
    conj1       = conjy; \
    conjh_conjx = bli_apply_conj( conjh, conjx ); \
    conjh_conjy = bli_apply_conj( conjh, conjy ); \
\
    /* Eliminate unused variable warnings.  The casts are placed after the \
       assignments so that they never read an uninitialized object. */ \
    ( void )conjh_conjx; \
    ( void )conjh_conjy; \
\
    PASTECH(ch,axpy2v_ker_ft) kfp_2v; \
\
    /* Query the context for the kernel function pointer. */ \
    kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \
\
    for ( i = 0; i < m; ++i ) \
    { \
        n_ahead  = m - i - 1; \
        chi1     = x + (i  )*incx; \
        x2       = x + (i+1)*incx; \
        psi1     = y + (i  )*incy; \
        y2       = y + (i+1)*incy; \
        gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
        c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
        /* Apply conjx and/or conjy to chi1 and/or psi1. */ \
        PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \
        PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \
        PASTEMAC(ch,copycjs)( conj0,       *chi1, conjx0_chi1 ); \
\
        /* Compute scalars for vector subproblems. */ \
        PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \
        PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \
\
        /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have \
           already been conjugated, if needed, by conjx and conjy. */ \
        PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \
\
        /* c21 = c21 +      alpha  * x2 * conj(psi1); */ \
        /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \
        kfp_2v \
        ( \
          conj0, \
          conj1, \
          n_ahead, \
          &alpha0_psi1, \
          &alpha1_chi1, \
          x2,  incx, \
          y2,  incy, \
          c21, rs_ct, \
          cntx \
        ); \
\
        /* gamma11 = gamma11 +      alpha  * chi1 * conj(psi1) \
                             + conj(alpha) * psi1 * conj(chi1); \
           realised as two additions of the same product. */ \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
        PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \
\
        /* For her2, explicitly set the imaginary component of gamma11 to \
           zero. */ \
        if ( bli_is_conj( conjh ) ) \
            PASTEMAC(ch,seti0s)( *gamma11 ); \
    } \
}
INSERT_GENTFUNC_BASIC0( her2_unf_var4 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_var.h 0000664 0000000 0000000 00000005555 14634250137 0022443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one void function, named by PASTEMAC0(opname),
// that takes the conjh flag, the obj_t operands alpha, alpha_conj, x, y and
// c, a context (cntx) and a control-tree node (cntl).
//
// Prototypes are emitted below for the blocked (blk), unblocked (unb) and
// fused-kernel unblocked (unf) her2 variants.
// NOTE(review): definitions of the her2_blk_var* functions are not visible
// alongside the unb/unf object front-ends -- confirm they exist before
// calling them.
//
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
conj_t conjh, \
obj_t* alpha, \
obj_t* alpha_conj, \
obj_t* x, \
obj_t* y, \
obj_t* c, \
cntx_t* cntx, \
cntl_t* cntl \
);
GENPROT( her2_blk_var1 )
GENPROT( her2_blk_var2 )
GENPROT( her2_blk_var3 )
GENPROT( her2_blk_var4 )
GENPROT( her2_unb_var1 )
GENPROT( her2_unb_var2 )
GENPROT( her2_unb_var3 )
GENPROT( her2_unb_var4 )
GENPROT( her2_unf_var1 )
GENPROT( her2_unf_var4 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares one void function, named by
// PASTEMAC(ch,varname), whose operands are raw ctype pointers:
//   uplo, conjx, conjy, conjh  - storage/conjugation flags
//   m                          - dimension used as the loop bound of the
//                                variants
//   alpha                      - pointer to the scalar
//   x, incx / y, incy          - vectors and their element strides
//   c, rs_c, cs_c              - matrix with its row and column strides
//   cntx                       - context queried for the level-1 kernels
// The parameter list matches the GENTFUNC definitions of the her2 unb/unf
// variants. INSERT_GENTPROT_BASIC0 presumably instantiates the prototype
// once per basic datatype -- its definition lives outside this header.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uplo, \
conj_t conjx, \
conj_t conjy, \
conj_t conjh, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC0( her2_unb_var1 )
INSERT_GENTPROT_BASIC0( her2_unb_var2 )
INSERT_GENTPROT_BASIC0( her2_unb_var3 )
INSERT_GENTPROT_BASIC0( her2_unb_var4 )
INSERT_GENTPROT_BASIC0( her2_unf_var1 )
INSERT_GENTPROT_BASIC0( her2_unf_var4 )
cython-blis-1.0.0/blis/_src/frame/2/her2/bli_her2_var_oapi.c 0000664 0000000 0000000 00000006232 14634250137 0023437 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based front-ends for the typed her2 variants.
//
// GENFRONT( opname, varname ) defines the function named by
// PASTEMAC0(varname). The generated function:
//   - calls bli_init_once();
//   - reads the datatype, uplo, length, buffer and row/column strides of c;
//   - reads the conjugation status, buffer and vector increment of x and y;
//   - obtains alpha's buffer via bli_obj_buffer_for_1x1() using c's datatype;
//   - forwards all of the above, plus conjh and cntx, to the function
//     pointer returned by PASTEMAC(varname,_qfp)( dt ).
// The alpha_conj and cntl parameters are accepted but not referenced by the
// generated body.
//
// NOTE: the closing brace of the macro body deliberately carries no trailing
// line continuation. A continuation there would splice the first GENFRONT()
// instantiation into the macro's replacement list instead of expanding it.
//
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       conj_t  conjh, \
       obj_t*  alpha, \
       obj_t*  alpha_conj, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  c, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
	bli_init_once(); \
\
	num_t     dt        = bli_obj_dt( c ); \
\
	uplo_t    uplo      = bli_obj_uplo( c ); \
	conj_t    conjx     = bli_obj_conj_status( x ); \
	conj_t    conjy     = bli_obj_conj_status( y ); \
\
	dim_t     m         = bli_obj_length( c ); \
\
	void*     buf_x     = bli_obj_buffer_at_off( x ); \
	inc_t     incx      = bli_obj_vector_inc( x ); \
\
	void*     buf_y     = bli_obj_buffer_at_off( y ); \
	inc_t     incy      = bli_obj_vector_inc( y ); \
\
	void*     buf_c     = bli_obj_buffer_at_off( c ); \
	inc_t     rs_c      = bli_obj_row_stride( c ); \
	inc_t     cs_c      = bli_obj_col_stride( c ); \
\
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
	/* Query a type-specific function pointer, except one that uses */ \
	/* void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  uplo, \
	  conjx, \
	  conjy, \
	  conjh, \
	  m, \
	  buf_alpha, \
	  buf_x, incx, \
	  buf_y, incy, \
	  buf_c, rs_c, cs_c, \
	  cntx \
	); \
}

GENFRONT( her2, her2_unb_var1 )
GENFRONT( her2, her2_unb_var2 )
GENFRONT( her2, her2_unb_var3 )
GENFRONT( her2, her2_unb_var4 )

GENFRONT( her2, her2_unf_var1 )
GENFRONT( her2, her2_unf_var4 )
cython-blis-1.0.0/blis/_src/frame/2/her2/other/ 0000775 0000000 0000000 00000000000 14634250137 0021043 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_blk_var1.c 0000664 0000000 0000000 00000013103 14634250137 0024454 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked her2, variant 1.
//
// Steps down the diagonal of c in blocks. For each diagonal block C11 the
// off-diagonal block C10 receives two ger updates and C11 itself receives a
// her2 update through the sub-control-tree of cntl. C11, x1 and y1 are
// packed (if the control tree asks for it) before the updates, and C11 is
// unpacked afterwards.
//
// The algorithm is written for the lower-triangular case only. Upper
// storage is still handled: when bli_acquire_mpart_tl2br() is given a
// matrix stored in the upper triangle and asked for a subpartition in the
// lower triangle, it returns the mirrored upper subpartition marked for
// transposition (and conjugation).
void bli_her2_blk_var1( conj_t  conjh,
                        obj_t*  alpha,
                        obj_t*  alpha_conj,
                        obj_t*  x,
                        obj_t*  y,
                        obj_t*  c,
                        cntx_t* cntx,
                        her2_t* cntl )
{
	obj_t c_diag, c_diag_pk;   // C11 and its packed counterpart
	obj_t c_left;              // C10
	obj_t x_mid, x_mid_pk;     // x1 and its packed counterpart
	obj_t x_top;               // x0
	obj_t y_mid, y_mid_pk;     // y1 and its packed counterpart
	obj_t y_top;               // y0

	dim_t n_diag;              // length of c
	dim_t pos;                 // current offset along the diagonal
	dim_t blk;                 // blocksize used for the current step

	// The pack objects outlive the loop, so set them up once here.
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_mid_pk );
	bli_obj_init_pack( &y_mid_pk );

	n_diag = bli_obj_length( c );

	// Walk the diagonal from the top-left corner.
	pos = 0;
	while ( pos < n_diag )
	{
		// Algorithmic blocksize for this step.
		blk = bli_determine_blocksize_f( pos, n_diag, c,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Carve out C11, C10, x1, x0, y1 and y0.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, pos, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART10, pos, blk, c, &c_left );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, x, &x_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART0, pos, blk, x, &x_top );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, y, &y_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART0, pos, blk, y, &y_top );

		// Prepare the pack objects for C11, x1 and y1 (if needed).
		bli_packm_init( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_init( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// Copy/pack C11, x1 and y1 (if needed).
		bli_packm_int( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_int( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// C10 += alpha * x1 * y0'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_mid_pk, &y_top, &c_left,
		             cntx, bli_cntl_sub_ger_rp( cntl ) );

		// C10 += conj(alpha) * y1 * x0'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj,
		             &y_mid_pk, &x_top, &c_left,
		             cntx, bli_cntl_sub_ger_rp( cntl ) );

		// C11 += alpha * x1 * y1' + conj(alpha) * y1 * x1'
		bli_her2_int( conjh, alpha, alpha_conj,
		              &x_mid_pk, &y_mid_pk, &c_diag_pk,
		              cntx, bli_cntl_sub_her2( cntl ) );

		// Copy/unpack C11 (if it was packed).
		bli_unpackm_int( &c_diag_pk, &c_diag, cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		pos += blk;
	}

	// Hand any packing buffers acquired during packing back to the memory
	// manager.
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_mid_pk, bli_cntl_sub_packv_x1( cntl ) );
	bli_packv_release( &y_mid_pk, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_blk_var2.c 0000664 0000000 0000000 00000013262 14634250137 0024463 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked her2, variant 2.
//
// Steps down the diagonal of c in blocks. For each diagonal block C11, the
// block to its left (C10) and the block below it (C21) each receive one ger
// update, and C11 receives a her2 update through the sub-control-tree of
// cntl. C11, x1 and y1 are packed (if the control tree asks for it) before
// the updates, and C11 is unpacked afterwards.
//
// The algorithm is written for the lower-triangular case only. Upper
// storage is still handled: when bli_acquire_mpart_tl2br() is given a
// matrix stored in the upper triangle and asked for a subpartition in the
// lower triangle, it returns the mirrored upper subpartition marked for
// transposition (and conjugation).
void bli_her2_blk_var2( conj_t  conjh,
                        obj_t*  alpha,
                        obj_t*  alpha_conj,
                        obj_t*  x,
                        obj_t*  y,
                        obj_t*  c,
                        cntx_t* cntx,
                        her2_t* cntl )
{
	obj_t c_diag, c_diag_pk;   // C11 and its packed counterpart
	obj_t c_left;              // C10
	obj_t c_below;             // C21
	obj_t x_mid, x_mid_pk;     // x1 and its packed counterpart
	obj_t x_top;               // x0
	obj_t x_bot;               // x2
	obj_t y_mid, y_mid_pk;     // y1 and its packed counterpart

	dim_t n_diag;              // length of c
	dim_t pos;                 // current offset along the diagonal
	dim_t blk;                 // blocksize used for the current step

	// The pack objects outlive the loop, so set them up once here.
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_mid_pk );
	bli_obj_init_pack( &y_mid_pk );

	n_diag = bli_obj_length( c );

	// Walk the diagonal from the top-left corner.
	pos = 0;
	while ( pos < n_diag )
	{
		// Algorithmic blocksize for this step.
		blk = bli_determine_blocksize_f( pos, n_diag, c,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Carve out C11, C10, C21, x1, x0, x2 and y1.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, pos, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART10, pos, blk, c, &c_left );
		bli_acquire_mpart_tl2br( BLIS_SUBPART21, pos, blk, c, &c_below );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, x, &x_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART0, pos, blk, x, &x_top );
		bli_acquire_vpart_f2b( BLIS_SUBPART2, pos, blk, x, &x_bot );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, y, &y_mid );

		// Prepare the pack objects for C11, x1 and y1 (if needed).
		bli_packm_init( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_init( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// Copy/pack C11, x1 and y1 (if needed).
		bli_packm_int( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_int( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// C10 += conj(alpha) * y1 * x0'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj,
		             &y_mid_pk, &x_top, &c_left,
		             cntx, bli_cntl_sub_ger_rp( cntl ) );

		// C21 += alpha * x2 * y1'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_bot, &y_mid_pk, &c_below,
		             cntx, bli_cntl_sub_ger_cp( cntl ) );

		// C11 += alpha * x1 * y1' + conj(alpha) * y1 * x1'
		bli_her2_int( conjh, alpha, alpha_conj,
		              &x_mid_pk, &y_mid_pk, &c_diag_pk,
		              cntx, bli_cntl_sub_her2( cntl ) );

		// Copy/unpack C11 (if it was packed).
		bli_unpackm_int( &c_diag_pk, &c_diag, cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		pos += blk;
	}

	// Hand any packing buffers acquired during packing back to the memory
	// manager.
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_mid_pk, bli_cntl_sub_packv_x1( cntl ) );
	bli_packv_release( &y_mid_pk, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_blk_var3.c 0000664 0000000 0000000 00000013262 14634250137 0024464 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked her2, variant 3.
//
// Steps down the diagonal of c in blocks. For each diagonal block C11, the
// block to its left (C10) and the block below it (C21) each receive one ger
// update, and C11 receives a her2 update through the sub-control-tree of
// cntl. C11, x1 and y1 are packed (if the control tree asks for it) before
// the updates, and C11 is unpacked afterwards.
//
// The algorithm is written for the lower-triangular case only. Upper
// storage is still handled: when bli_acquire_mpart_tl2br() is given a
// matrix stored in the upper triangle and asked for a subpartition in the
// lower triangle, it returns the mirrored upper subpartition marked for
// transposition (and conjugation).
void bli_her2_blk_var3( conj_t  conjh,
                        obj_t*  alpha,
                        obj_t*  alpha_conj,
                        obj_t*  x,
                        obj_t*  y,
                        obj_t*  c,
                        cntx_t* cntx,
                        her2_t* cntl )
{
	obj_t c_diag, c_diag_pk;   // C11 and its packed counterpart
	obj_t c_left;              // C10
	obj_t c_below;             // C21
	obj_t x_mid, x_mid_pk;     // x1 and its packed counterpart
	obj_t y_mid, y_mid_pk;     // y1 and its packed counterpart
	obj_t y_top;               // y0
	obj_t y_bot;               // y2

	dim_t n_diag;              // length of c
	dim_t pos;                 // current offset along the diagonal
	dim_t blk;                 // blocksize used for the current step

	// The pack objects outlive the loop, so set them up once here.
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_mid_pk );
	bli_obj_init_pack( &y_mid_pk );

	n_diag = bli_obj_length( c );

	// Walk the diagonal from the top-left corner.
	pos = 0;
	while ( pos < n_diag )
	{
		// Algorithmic blocksize for this step.
		blk = bli_determine_blocksize_f( pos, n_diag, c,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Carve out C11, C10, C21, x1, y1, y0 and y2.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, pos, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART10, pos, blk, c, &c_left );
		bli_acquire_mpart_tl2br( BLIS_SUBPART21, pos, blk, c, &c_below );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, x, &x_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, y, &y_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART0, pos, blk, y, &y_top );
		bli_acquire_vpart_f2b( BLIS_SUBPART2, pos, blk, y, &y_bot );

		// Prepare the pack objects for C11, x1 and y1 (if needed).
		bli_packm_init( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_init( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// Copy/pack C11, x1 and y1 (if needed).
		bli_packm_int( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_int( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// C10 += alpha * x1 * y0'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_mid_pk, &y_top, &c_left,
		             cntx, bli_cntl_sub_ger_rp( cntl ) );

		// C21 += conj(alpha) * y2 * x1'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj,
		             &y_bot, &x_mid_pk, &c_below,
		             cntx, bli_cntl_sub_ger_cp( cntl ) );

		// C11 += alpha * x1 * y1' + conj(alpha) * y1 * x1'
		bli_her2_int( conjh, alpha, alpha_conj,
		              &x_mid_pk, &y_mid_pk, &c_diag_pk,
		              cntx, bli_cntl_sub_her2( cntl ) );

		// Copy/unpack C11 (if it was packed).
		bli_unpackm_int( &c_diag_pk, &c_diag, cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		pos += blk;
	}

	// Hand any packing buffers acquired during packing back to the memory
	// manager.
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_mid_pk, bli_cntl_sub_packv_x1( cntl ) );
	bli_packv_release( &y_mid_pk, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_blk_var4.c 0000664 0000000 0000000 00000013103 14634250137 0024457 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked her2, variant 4.
//
// Steps down the diagonal of c in blocks. For each diagonal block C11, the
// block below it (C21) receives two ger updates and C11 receives a her2
// update through the sub-control-tree of cntl. C11, x1 and y1 are packed
// (if the control tree asks for it) before the updates, and C11 is unpacked
// afterwards.
//
// The algorithm is written for the lower-triangular case only. Upper
// storage is still handled: when bli_acquire_mpart_tl2br() is given a
// matrix stored in the upper triangle and asked for a subpartition in the
// lower triangle, it returns the mirrored upper subpartition marked for
// transposition (and conjugation).
void bli_her2_blk_var4( conj_t  conjh,
                        obj_t*  alpha,
                        obj_t*  alpha_conj,
                        obj_t*  x,
                        obj_t*  y,
                        obj_t*  c,
                        cntx_t* cntx,
                        her2_t* cntl )
{
	obj_t c_diag, c_diag_pk;   // C11 and its packed counterpart
	obj_t c_below;             // C21
	obj_t x_mid, x_mid_pk;     // x1 and its packed counterpart
	obj_t x_bot;               // x2
	obj_t y_mid, y_mid_pk;     // y1 and its packed counterpart
	obj_t y_bot;               // y2

	dim_t n_diag;              // length of c
	dim_t pos;                 // current offset along the diagonal
	dim_t blk;                 // blocksize used for the current step

	// The pack objects outlive the loop, so set them up once here.
	bli_obj_init_pack( &c_diag_pk );
	bli_obj_init_pack( &x_mid_pk );
	bli_obj_init_pack( &y_mid_pk );

	n_diag = bli_obj_length( c );

	// Walk the diagonal from the top-left corner.
	pos = 0;
	while ( pos < n_diag )
	{
		// Algorithmic blocksize for this step.
		blk = bli_determine_blocksize_f( pos, n_diag, c,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Carve out C11, C21, x1, x2, y1 and y2.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11, pos, blk, c, &c_diag );
		bli_acquire_mpart_tl2br( BLIS_SUBPART21, pos, blk, c, &c_below );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, x, &x_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART2, pos, blk, x, &x_bot );
		bli_acquire_vpart_f2b( BLIS_SUBPART1, pos, blk, y, &y_mid );
		bli_acquire_vpart_f2b( BLIS_SUBPART2, pos, blk, y, &y_bot );

		// Prepare the pack objects for C11, x1 and y1 (if needed).
		bli_packm_init( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ) );
		bli_packv_init( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_init( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// Copy/pack C11, x1 and y1 (if needed).
		bli_packm_int( &c_diag, &c_diag_pk, cntx, bli_cntl_sub_packm_c11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_mid, &x_mid_pk, cntx, bli_cntl_sub_packv_x1( cntl ) );
		bli_packv_int( &y_mid, &y_mid_pk, cntx, bli_cntl_sub_packv_y1( cntl ) );

		// C21 += alpha * x2 * y1'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha,
		             &x_bot, &y_mid_pk, &c_below,
		             cntx, bli_cntl_sub_ger_cp( cntl ) );

		// C21 += conj(alpha) * y2 * x1'
		bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj,
		             &y_bot, &x_mid_pk, &c_below,
		             cntx, bli_cntl_sub_ger_cp( cntl ) );

		// C11 += alpha * x1 * y1' + conj(alpha) * y1 * x1'
		bli_her2_int( conjh, alpha, alpha_conj,
		              &x_mid_pk, &y_mid_pk, &c_diag_pk,
		              cntx, bli_cntl_sub_her2( cntl ) );

		// Copy/unpack C11 (if it was packed).
		bli_unpackm_int( &c_diag_pk, &c_diag, cntx, bli_cntl_sub_unpackm_c11( cntl ),
		                 &BLIS_PACKM_SINGLE_THREADED );

		pos += blk;
	}

	// Hand any packing buffers acquired during packing back to the memory
	// manager.
	bli_packm_release( &c_diag_pk, bli_cntl_sub_packm_c11( cntl ) );
	bli_packv_release( &x_mid_pk, bli_cntl_sub_packv_x1( cntl ) );
	bli_packv_release( &y_mid_pk, bli_cntl_sub_packv_y1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_cntl.c 0000664 0000000 0000000 00000014226 14634250137 0023722 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control-tree nodes defined in other translation units. They are passed
// as sub-nodes when the blocked her2 trees are created in
// bli_her2_cntl_init().
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackm_t* unpackm_cntl;
extern ger_t* ger_cntl_rp_bs_row;
extern ger_t* ger_cntl_cp_bs_col;
// her2 control trees owned by this file. They start out NULL, are assigned
// in bli_her2_cntl_init(), and are freed in bli_her2_cntl_finalize().
// The "bs_ke" trees are created with BLIS_UNB_FUSED and no sub-nodes; the
// "ge" trees are created with BLIS_BLOCKED and reference the "bs_ke" trees.
her2_t* her2_cntl_bs_ke_lrow_ucol = NULL;
her2_t* her2_cntl_bs_ke_lcol_urow = NULL;
her2_t* her2_cntl_ge_lrow_ucol = NULL;
her2_t* her2_cntl_ge_lcol_urow = NULL;
// Create the four global her2 control trees.
//
// Two leaf trees (BLIS_UNB_FUSED, variants 1 and 4, no sub-nodes) are built
// first; two blocked trees (BLIS_BLOCKED, variants 1 and 4, blocksize id
// BLIS_M2) are then built on top of them using the pack/unpack and ger
// control trees declared extern at the top of this file.
//
// Takes no arguments; the parameter list is spelled ( void ) so that the
// definition is a proper prototype matching the declaration in the header.
void bli_her2_cntl_init( void )
{
	// Create control trees for the lowest-level kernels. These trees induce
	// operations on (presumably) relatively small block-subvector problems.
	her2_cntl_bs_ke_lrow_ucol
	=
	bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
	                          BLIS_VARIANT1,
	                          0,
	                          NULL, NULL, NULL,
	                          NULL, NULL, NULL,
	                          NULL );
	her2_cntl_bs_ke_lcol_urow
	=
	bli_her2_cntl_obj_create( BLIS_UNB_FUSED,
	                          BLIS_VARIANT4,
	                          0,
	                          NULL, NULL, NULL,
	                          NULL, NULL, NULL,
	                          NULL );

	// Create control trees for generally large problems. Here, we choose
	// variants that partition for ger subproblems in the same direction
	// as the assumed storage.
	her2_cntl_ge_lrow_ucol
	=
	bli_her2_cntl_obj_create( BLIS_BLOCKED,
	                          BLIS_VARIANT1,
	                          BLIS_M2,
	                          packv_cntl,   // pack x1 (if needed)
	                          packv_cntl,   // pack y1 (if needed)
	                          packm_cntl,   // pack C11 (if needed)
	                          ger_cntl_rp_bs_row,
	                          ger_cntl_rp_bs_row,
	                          her2_cntl_bs_ke_lrow_ucol,
	                          unpackm_cntl ); // unpack C11 (if packed)
	her2_cntl_ge_lcol_urow
	=
	bli_her2_cntl_obj_create( BLIS_BLOCKED,
	                          BLIS_VARIANT4,
	                          BLIS_M2,
	                          packv_cntl,   // pack x1 (if needed)
	                          packv_cntl,   // pack y1 (if needed)
	                          packm_cntl,   // pack C11 (if needed)
	                          ger_cntl_cp_bs_col,
	                          ger_cntl_cp_bs_col,
	                          her2_cntl_bs_ke_lcol_urow,
	                          unpackm_cntl ); // unpack C11 (if packed)
}
void bli_her2_cntl_finalize()
{
bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol );
bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow );
bli_cntl_free_node( her2_cntl_ge_lrow_ucol );
bli_cntl_free_node( her2_cntl_ge_lcol_urow );
}
// Allocate a her2 control-tree node with bli_malloc_intl() and fill in
// every field from the corresponding argument.
//
// Returns the pointer to the newly allocated node.
her2_t* bli_her2_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  packv_t*   sub_packv_x1,
                                  packv_t*   sub_packv_y1,
                                  packm_t*   sub_packm_c11,
                                  ger_t*     sub_ger_rp,
                                  ger_t*     sub_ger_cp,
                                  her2_t*    sub_her2,
                                  unpackm_t* sub_unpackm_c11 )
{
	her2_t* node = ( her2_t* ) bli_malloc_intl( sizeof( her2_t ) );

	// Populate all fields of the node in one struct assignment.
	*node = ( her2_t )
	{
		.impl_type       = impl_type,
		.var_num         = var_num,
		.bszid           = bszid,
		.sub_packv_x1    = sub_packv_x1,
		.sub_packv_y1    = sub_packv_y1,
		.sub_packm_c11   = sub_packm_c11,
		.sub_ger_rp      = sub_ger_rp,
		.sub_ger_cp      = sub_ger_cp,
		.sub_her2        = sub_her2,
		.sub_unpackm_c11 = sub_unpackm_c11
	};

	return node;
}
// Fill in every field of an existing her2 control-tree node (cntl) from
// the corresponding argument. No memory is allocated here; the caller
// supplies the node.
void bli_her2_cntl_obj_init( her2_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             packv_t*   sub_packv_x1,
                             packv_t*   sub_packv_y1,
                             packm_t*   sub_packm_c11,
                             ger_t*     sub_ger_rp,
                             ger_t*     sub_ger_cp,
                             her2_t*    sub_her2,
                             unpackm_t* sub_unpackm_c11 )
{
	// Overwrite all fields of the node in one struct assignment.
	*cntl = ( her2_t )
	{
		.impl_type       = impl_type,
		.var_num         = var_num,
		.bszid           = bszid,
		.sub_packv_x1    = sub_packv_x1,
		.sub_packv_y1    = sub_packv_y1,
		.sub_packm_c11   = sub_packm_c11,
		.sub_ger_rp      = sub_ger_rp,
		.sub_ger_cp      = sub_ger_cp,
		.sub_her2        = sub_her2,
		.sub_unpackm_c11 = sub_unpackm_c11
	};
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_cntl.h 0000664 0000000 0000000 00000006370 14634250137 0023730 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control-tree node for the her2 operation. Each node records how the
// operation is implemented at this level and points to the control-tree
// nodes used for its sub-operations. Sub-node pointers may be NULL (the
// leaf trees are created with all of them NULL).
struct her2_s
{
// Implementation type of this node (e.g. BLIS_UNB_FUSED or BLIS_BLOCKED).
impl_t impl_type;
// Variant number of the implementation (e.g. BLIS_VARIANT1, BLIS_VARIANT4).
varnum_t var_num;
// Blocksize identifier (e.g. BLIS_M2 for the blocked trees).
bszid_t bszid;
// Sub-node for packing x1 (if needed).
struct packv_s* sub_packv_x1;
// Sub-node for packing y1 (if needed).
struct packv_s* sub_packv_y1;
// Sub-node for packing C11 (if needed).
struct packm_s* sub_packm_c11;
// Sub-nodes for the ger sub-problems; the "rp" and "cp" suffixes presumably
// mean row-panel and column-panel -- confirm against the ger control trees.
struct ger_s* sub_ger_rp;
struct ger_s* sub_ger_cp;
// Sub-node for the her2 sub-problem on the diagonal block.
struct her2_s* sub_her2;
// Sub-node for unpacking C11 (if it was packed).
struct unpackm_s* sub_unpackm_c11;
};
typedef struct her2_s her2_t;
// Access the sub_her2 member of the control-tree node pointed to by cntl.
// Both the argument and the whole expansion are parenthesized so the macro
// also works when cntl is an expression (e.g. a pointer plus an offset) and
// when the result is used inside a larger expression. The expansion is
// still an lvalue.
#define bli_cntl_sub_her2( cntl ) ( (cntl)->sub_her2 )
// Create the global her2 control trees.
void bli_her2_cntl_init( void );
// Free the global her2 control trees.
void bli_her2_cntl_finalize( void );
// Allocate a her2 control-tree node, set each of its fields from the
// corresponding argument, and return a pointer to it.
her2_t* bli_her2_cntl_obj_create( impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x1,
packv_t* sub_packv_y1,
packm_t* sub_packm_c11,
ger_t* sub_ger_rp,
ger_t* sub_ger_cp,
her2_t* sub_her2,
unpackm_t* sub_unpackm_c11 );
// Set each field of an existing, caller-provided her2 control-tree node
// (cntl) from the corresponding argument.
void bli_her2_cntl_obj_init( her2_t* cntl,
impl_t impl_type,
varnum_t var_num,
bszid_t bszid,
packv_t* sub_packv_x1,
packv_t* sub_packv_y1,
packm_t* sub_packm_c11,
ger_t* sub_ger_rp,
ger_t* sub_ger_cp,
her2_t* sub_her2,
unpackm_t* sub_unpackm_c11 );
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_front.c 0000664 0000000 0000000 00000015213 14634250137 0024107 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// her2 control trees defined in another translation unit. bli_her2_front()
// selects one of them: a "bs_ke" tree when x, y and c all have unit stride,
// otherwise a "ge" tree. The "lrow_ucol" trees serve lower/row-stored and
// upper/column-stored c; the "lcol_urow" trees serve the other two cases.
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
extern her2_t* her2_cntl_bs_ke_lcol_urow;
extern her2_t* her2_cntl_ge_lrow_ucol;
extern her2_t* her2_cntl_ge_lcol_urow;
// Object-based front-end for her2.
//
// Optionally checks the operands, makes two local copy-casts of alpha (one
// unconjugated, one conjugated) in the datatype union of x's and y's target
// datatypes, selects a control tree based on the strides, uplo and storage
// of c, and then calls bli_her2_int() with conjh set to BLIS_CONJUGATE.
//
// Note that when not all operands have unit stride, the pack schema of any
// unit-stride operand among x, y and c is modified in place.
void bli_her2_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y,
       obj_t*  c,
       cntx_t* cntx
     )
{
	her2_t* tree;
	obj_t   alpha_cast;
	obj_t   alpha_cast_conj;

	// Validate the operands when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
		bli_her2_check( alpha, x, y, c );

	// Target datatypes of the two vectors.
	num_t x_targ_dt = bli_obj_target_dt( x );
	num_t y_targ_dt = bli_obj_target_dt( y );

	// Determine whether each operand has unit stride. For c this means
	// being either row-stored or column-stored.
	bool x_unit = ( bli_obj_vector_inc( x ) == 1 );
	bool y_unit = ( bli_obj_vector_inc( y ) == 1 );
	bool c_unit = ( bli_obj_is_row_stored( c ) ||
	                bli_obj_is_col_stored( c ) );

	// Copy-cast alpha into the type union of x and y, once as-is and once
	// conjugated.
	num_t alpha_dt = bli_dt_union( x_targ_dt, y_targ_dt );

	bli_obj_scalar_init_detached_copy_of( alpha_dt, BLIS_NO_CONJUGATE,
	                                      alpha, &alpha_cast );
	bli_obj_scalar_init_detached_copy_of( alpha_dt, BLIS_CONJUGATE,
	                                      alpha, &alpha_cast_conj );

	// Two trees cover the four uplo/storage combinations in each group:
	// lower/row-stored and upper/column-stored share one tree, and the
	// remaining two combinations share the other.
	if ( x_unit && y_unit && c_unit )
	{
		// Everything has unit stride: use a tree that calls the unblocked
		// implementation directly, without any blocking.
		if ( bli_obj_is_lower( c ) )
			tree = bli_obj_is_row_stored( c ) ? her2_cntl_bs_ke_lrow_ucol
			                                  : her2_cntl_bs_ke_lcol_urow;
		else // upper
			tree = bli_obj_is_row_stored( c ) ? her2_cntl_bs_ke_lcol_urow
			                                  : her2_cntl_bs_ke_lrow_ucol;
	}
	else
	{
		// Flag the unit-stride operands as already packed so the blocked
		// algorithm does not pack them needlessly.
		if ( x_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
		if ( y_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );
		if ( c_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c );

		// Same selection as above, but among the trees that perform
		// blocking.
		if ( bli_obj_is_lower( c ) )
			tree = bli_obj_is_row_stored( c ) ? her2_cntl_ge_lrow_ucol
			                                  : her2_cntl_ge_lcol_urow;
		else // upper
			tree = bli_obj_is_row_stored( c ) ? her2_cntl_ge_lcol_urow
			                                  : her2_cntl_ge_lrow_ucol;
	}

	// Run the internal back-end with the copy-cast scalars and the chosen
	// tree. BLIS_CONJUGATE for conjh selects the Hermitian (rather than
	// symmetric) algorithms.
	bli_her2_int( BLIS_CONJUGATE,
	              &alpha_cast,
	              &alpha_cast_conj,
	              x,
	              y,
	              c,
	              cntx,
	              tree );
}
//
// Typed (BLAS-like) wrappers around the object-based front-end. Each
// instance wraps the raw buffers in obj_t objects that share one datatype,
// records conjugation/uplo/structure on them, and calls PASTEMAC0(opname).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt       = PASTEMAC(ch,type); \
\
    /* Vectors are described as m x 1 matrices. */ \
    const inc_t x_rs     = incx; \
    const inc_t x_cs     = m * incx; \
    const inc_t y_rs     = incy; \
    const inc_t y_cs     = m * incy; \
\
    obj_t       alpha_obj; \
    obj_t       x_obj; \
    obj_t       y_obj; \
    obj_t       c_obj; \
\
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alpha_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, x, x_rs, x_cs, &x_obj ); \
    bli_obj_set_conj( conjx, &x_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, y, y_rs, y_cs, &y_obj ); \
    bli_obj_set_conj( conjy, &y_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &c_obj ); \
    bli_obj_set_uplo( uploc, &c_obj ); \
    bli_obj_set_struc( BLIS_HERMITIAN, &c_obj ); \
\
    PASTEMAC0(opname)( &alpha_obj, &x_obj, &y_obj, &c_obj, cntx ); \
}

INSERT_GENTFUNC_BASIC0( her2_front )
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_front.h 0000664 0000000 0000000 00000004232 14634250137 0024113 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based front-end for the her2 operation.
// Takes the scalar alpha, the vector operands x and y, the matrix operand c,
// and a context pointer.
void bli_her2_front
(
obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* c,
cntx_t* cntx
);
// Prototype template for the typed her2_front interfaces: buffers of a
// single ctype plus their strides, with uplo/conjugation passed explicitly.
// NOTE(review): INSERT_GENTPROT_BASIC presumably expands GENTPROT once per
// basic datatype -- confirm against the macro definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
conj_t conjx, \
conj_t conjy, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* y, inc_t incy, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( her2_front )
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_int.c 0000664 0000000 0000000 00000010734 14634250137 0023554 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by every her2 variant in the table below.
#define FUNCPTR_T her2_fp
typedef void (*FUNCPTR_T)( conj_t conjh,
obj_t* alpha,
obj_t* alpha_conj,
obj_t* x,
obj_t* y,
obj_t* c,
cntx_t* cntx,
her2_t* cntl );
// Dispatch table: one row per variant, one column per implementation type
// (see the column header below).
// NOTE(review): the fused column holds NULL for the second and third rows;
// nothing in this file guards against a control tree selecting those slots.
static FUNCPTR_T vars[4][3] =
{
// unblocked unblocked with fusing blocked
{ bli_her2_unb_var1, bli_her2_unf_var1, bli_her2_blk_var1 },
{ bli_her2_unb_var2, NULL, bli_her2_blk_var2 },
{ bli_her2_unb_var3, NULL, bli_her2_blk_var3 },
{ bli_her2_unb_var4, bli_her2_unf_var4, bli_her2_blk_var4 },
};
//
// Internal back-end shared by the her2 and syr2 front-ends.
//
// Parameters:
//   conjh       BLIS_CONJUGATE selects her2 parameter checking, anything
//               else selects syr2 checking; it is also forwarded to the
//               variant.
//   alpha       scalar object.
//   alpha_conj  second scalar object, supplied by the caller alongside
//               alpha.
//   x, y, c     operands; aliased locally so that conjugation toggles do
//               not modify the caller's objects.
//   cntx        context, forwarded to the variant.
//   cntl        control tree node supplying the variant number and the
//               implementation type used to index the vars table.
//
// Returns early, without invoking a variant, if c, x, or y has a zero
// dimension.
//
void bli_her2_int( conj_t  conjh,
                   obj_t*  alpha,
                   obj_t*  alpha_conj,
                   obj_t*  x,
                   obj_t*  y,
                   obj_t*  c,
                   cntx_t* cntx,
                   her2_t* cntl )
{
    varnum_t  n;
    impl_t    i;
    FUNCPTR_T f;
    obj_t     alpha_local;
    obj_t     alpha_conj_local;
    obj_t     x_local;
    obj_t     y_local;
    obj_t     c_local;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        if ( bli_is_conj( conjh ) ) bli_her2_check( alpha, x, y, c );
        else                        bli_syr2_check( alpha, x, y, c );
    }

    // If C, x, or y has a zero dimension, return early.
    if ( bli_obj_has_zero_dim( c ) ) return;
    if ( bli_obj_has_zero_dim( x ) ) return;
    if ( bli_obj_has_zero_dim( y ) ) return;

    // Alias the operands in case we need to apply conjugations.
    bli_obj_alias_to( x, &x_local );
    bli_obj_alias_to( y, &y_local );
    bli_obj_alias_to( c, &c_local );

    // If matrix C is marked for conjugation, we interpret this as a request
    // to apply a conjugation to the other operands.
    if ( bli_obj_has_conj( &c_local ) )
    {
        bli_obj_toggle_conj( &c_local );
        bli_obj_toggle_conj( &x_local );
        bli_obj_toggle_conj( &y_local );

        bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ),
                                              BLIS_CONJUGATE,
                                              alpha,
                                              &alpha_local );
        bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha_conj ),
                                              BLIS_CONJUGATE,
                                              alpha_conj,
                                              &alpha_conj_local );
    }
    else
    {
        // Alias the scalars using the same pointer-based calling convention
        // as the operand aliases above (source pointer, destination
        // pointer).
        bli_obj_alias_to( alpha, &alpha_local );
        bli_obj_alias_to( alpha_conj, &alpha_conj_local );
    }

    // Extract the variant number and implementation type.
    n = bli_cntl_var_num( cntl );
    i = bli_cntl_impl_type( cntl );

    // Index into the variant array to extract the correct function pointer.
    // NOTE(review): some slots of the table are NULL; this relies on the
    // control tree never selecting them.
    f = vars[n][i];

    // Invoke the variant.
    f( conjh,
       &alpha_local,
       &alpha_conj_local,
       &x_local,
       &y_local,
       &c_local,
       cntx,
       cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/her2/other/bli_her2_int.h 0000664 0000000 0000000 00000003623 14634250137 0023560 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal her2 back-end entry point. In addition to the operands it takes
// conjh, a second scalar object (alpha_conj), a context, and the her2
// control tree node.
void bli_her2_int( conj_t conjh,
obj_t* alpha,
obj_t* alpha_conj,
obj_t* x,
obj_t* y,
obj_t* c,
cntx_t* cntx,
her2_t* cntl );
cython-blis-1.0.0/blis/_src/frame/2/symv/ 0000775 0000000 0000000 00000000000 14634250137 0020060 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/symv/bli_symv.h 0000664 0000000 0000000 00000003344 14634250137 0022061 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_symv_front.h"
cython-blis-1.0.0/blis/_src/frame/2/symv/other/ 0000775 0000000 0000000 00000000000 14634250137 0021201 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/symv/other/bli_symv_front.c 0000664 0000000 0000000 00000016264 14634250137 0024412 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// hemv control trees (defined elsewhere); symv reuses them.
extern hemv_t* hemv_cntl_bs_ke_lrow_ucol;
extern hemv_t* hemv_cntl_bs_ke_lcol_urow;
extern hemv_t* hemv_cntl_ge_lrow_ucol;
extern hemv_t* hemv_cntl_ge_lcol_urow;

//
// Object-based front-end for symv: validates the operands, copy-casts the
// two scalars, selects a hemv control tree from the uplo/storage of a, and
// hands everything to bli_hemv_int() with conjh = BLIS_NO_CONJUGATE.
//
void bli_symv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx
     )
{
    obj_t   alpha_cast;
    obj_t   beta_cast;
    hemv_t* tree;
    bool    a_is_unit, x_is_unit, y_is_unit;
    bool    lower, rowwise;

    // Parameter checking is optional at runtime.
    if ( bli_error_checking_is_enabled() )
        bli_symv_check( alpha, a, x, beta, y );

    // A matrix counts as unit-stride when it is row- or column-stored; a
    // vector when its increment is one.
    a_is_unit = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) );
    x_is_unit = ( bli_obj_vector_inc( x ) == 1 );
    y_is_unit = ( bli_obj_vector_inc( y ) == 1 );

    // alpha is copy-cast to the union of the target datatypes of a and x so
    // that no information is lost unnecessarily during the computation.
    bli_obj_scalar_init_detached_copy_of( bli_dt_union( bli_obj_target_dt( a ),
                                                        bli_obj_target_dt( x ) ),
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    // beta is copy-cast to the target datatype of y: a complex beta gains
    // nothing when y is real (the imaginary part of beta*y would not be
    // stored), and a real beta is harmlessly promoted when y is complex.
    bli_obj_scalar_init_detached_copy_of( bli_obj_target_dt( y ),
                                          BLIS_NO_CONJUGATE,
                                          beta,
                                          &beta_cast );

    if ( a_is_unit && x_is_unit && y_is_unit )
    {
        // Every operand has unit stride: pick a tree that calls the
        // unblocked implementation directly. Two trees cover the four
        // uplo/storage combinations, since lower/row-stored pairs with
        // upper/column-stored and vice versa.
        lower   = bli_obj_is_lower( a );
        rowwise = bli_obj_is_row_stored( a );

        tree = ( lower ? rowwise : !rowwise ) ? hemv_cntl_bs_ke_lrow_ucol
                                              : hemv_cntl_bs_ke_lcol_urow;
    }
    else
    {
        // Flag the unit-stride operands as already packed so that the
        // blocked algorithm does not pack them needlessly.
        if ( a_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );
        if ( x_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
        if ( y_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );

        // Same pairing as above, but keyed on storage tilt and selecting
        // the trees that perform blocking.
        lower   = bli_obj_is_lower( a );
        rowwise = bli_obj_is_row_tilted( a );

        tree = ( lower ? rowwise : !rowwise ) ? hemv_cntl_ge_lrow_ucol
                                              : hemv_cntl_ge_lcol_urow;
    }

    // BLIS_NO_CONJUGATE requests the symmetric (not Hermitian) algorithms
    // from the shared hemv back-end.
    bli_hemv_int( BLIS_NO_CONJUGATE,
                  &alpha_cast,
                  a,
                  x,
                  &beta_cast,
                  y,
                  cntx,
                  tree );
}
//
// Typed (BLAS-like) wrappers around the object-based front-end. Each
// instance wraps the raw buffers in obj_t objects that share one datatype,
// records uplo/conjugation/structure on them, and calls PASTEMAC0(opname).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt       = PASTEMAC(ch,type); \
\
    /* Vectors are described as m x 1 matrices. */ \
    const inc_t x_rs     = incx; \
    const inc_t x_cs     = m * incx; \
    const inc_t y_rs     = incy; \
    const inc_t y_cs     = m * incy; \
\
    obj_t       alpha_obj; \
    obj_t       beta_obj; \
    obj_t       a_obj; \
    obj_t       x_obj; \
    obj_t       y_obj; \
\
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alpha_obj ); \
    bli_obj_create_1x1_with_attached_buffer( dt, beta,  &beta_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &a_obj ); \
    bli_obj_set_uplo( uploa, &a_obj ); \
    bli_obj_set_conj( conja, &a_obj ); \
    bli_obj_set_struc( BLIS_SYMMETRIC, &a_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, x, x_rs, x_cs, &x_obj ); \
    bli_obj_set_conj( conjx, &x_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, y, y_rs, y_cs, &y_obj ); \
\
    PASTEMAC0(opname)( &alpha_obj, &a_obj, &x_obj, &beta_obj, &y_obj, cntx ); \
}

INSERT_GENTFUNC_BASIC0( symv_front )
cython-blis-1.0.0/blis/_src/frame/2/symv/other/bli_symv_front.h 0000664 0000000 0000000 00000004276 14634250137 0024417 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based front-end for the symv operation.
// Takes the scalars alpha and beta, the matrix operand a, the vector
// operands x and y, and a context pointer.
void bli_symv_front
(
obj_t* alpha,
obj_t* a,
obj_t* x,
obj_t* beta,
obj_t* y,
cntx_t* cntx
);
// Prototype template for the typed symv_front interfaces: buffers of a
// single ctype plus their strides, with uplo/conjugation passed explicitly.
// NOTE(review): INSERT_GENTPROT_BASIC presumably expands GENTPROT once per
// basic datatype -- confirm against the macro definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploa, \
conj_t conja, \
conj_t conjx, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
ctype* beta, \
ctype* y, inc_t incy, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( symv_front )
cython-blis-1.0.0/blis/_src/frame/2/syr/ 0000775 0000000 0000000 00000000000 14634250137 0017677 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/syr/bli_syr.h 0000664 0000000 0000000 00000003343 14634250137 0021516 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_syr_front.h"
cython-blis-1.0.0/blis/_src/frame/2/syr/other/ 0000775 0000000 0000000 00000000000 14634250137 0021020 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/syr/other/bli_syr_front.c 0000664 0000000 0000000 00000013560 14634250137 0024044 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees for her, defined elsewhere; syr reuses them. The "bs_ke"
// trees are used below for the all-unit-stride case and the "ge" trees for
// the blocked case.
extern her_t* her_cntl_bs_ke_lrow_ucol;
extern her_t* her_cntl_bs_ke_lcol_urow;
extern her_t* her_cntl_ge_lrow_ucol;
extern her_t* her_cntl_ge_lcol_urow;

//
// Object-based front-end for the symmetric rank-1 update.
//
// Parameters:
//   alpha  scalar object; copy-cast before use.
//   x      vector operand.
//   c      matrix operand whose uplo/storage select the control tree.
//   cntx   context, forwarded untouched to bli_her_int().
//
// Side effect: when not every operand has unit stride, the operands that
// do have unit stride get their pack schema set on the caller's objects.
//
void bli_syr_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  c,
       cntx_t* cntx
     )
{
    her_t* her_cntl;
    num_t  dt_targ_x;
    num_t  dt_targ_c;
    bool   x_has_unit_inc;
    bool   c_has_unit_inc;
    obj_t  alpha_local;
    num_t  dt_alpha;

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
        bli_syr_check( alpha, x, c );

    // Query the target datatypes of each object.
    dt_targ_x = bli_obj_target_dt( x );
    dt_targ_c = bli_obj_target_dt( c );

    // Determine whether each operand has unit stride. For the matrix this
    // means being either row- or column-stored.
    x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 );
    c_has_unit_inc = ( bli_obj_is_row_stored( c ) ||
                       bli_obj_is_col_stored( c ) );

    // Create an object to hold a copy-cast of alpha. Notice that we use
    // the type union of the target datatypes of x and c to prevent any
    // unnecessary loss of information during the computation.
    dt_alpha = bli_dt_union( dt_targ_x, dt_targ_c );
    bli_obj_scalar_init_detached_copy_of( dt_alpha,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_local );

    // If all operands have unit stride, we choose a control tree for calling
    // the unblocked implementation directly without any blocking.
    if ( x_has_unit_inc &&
         c_has_unit_inc )
    {
        // We use two control trees to handle the four cases corresponding to
        // combinations of upper/lower triangular storage and row/column-storage.
        // The row-stored lower triangular and column-stored upper triangular
        // trees are identical. Same for the remaining two trees.
        if ( bli_obj_is_lower( c ) )
        {
            if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol;
            else                              her_cntl = her_cntl_bs_ke_lcol_urow;
        }
        else // if ( bli_obj_is_upper( c ) )
        {
            if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lcol_urow;
            else                              her_cntl = her_cntl_bs_ke_lrow_ucol;
        }
    }
    else
    {
        // Mark objects with unit stride as already being packed. This prevents
        // unnecessary packing from happening within the blocked algorithm.
        if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
        if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c );

        // Here, we make a similar choice as above, except that (1) we look
        // at storage tilt, and (2) we choose a tree that performs blocking.
        // The tilt query (rather than the row-stored query) is what lets a
        // general-stride c influence the choice: a general-stride matrix is
        // never "row stored", so testing storage would always pick the same
        // tree for it.
        if ( bli_obj_is_lower( c ) )
        {
            if ( bli_obj_is_row_tilted( c ) ) her_cntl = her_cntl_ge_lrow_ucol;
            else                              her_cntl = her_cntl_ge_lcol_urow;
        }
        else // if ( bli_obj_is_upper( c ) )
        {
            if ( bli_obj_is_row_tilted( c ) ) her_cntl = her_cntl_ge_lcol_urow;
            else                              her_cntl = her_cntl_ge_lrow_ucol;
        }
    }

    // Invoke the internal back-end with the copy-cast scalar and the
    // chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the
    // symmetric (and not Hermitian) algorithms.
    bli_her_int( BLIS_NO_CONJUGATE,
                 &alpha_local,
                 x,
                 c,
                 cntx,
                 her_cntl );
}
//
// Typed (BLAS-like) wrappers around the object-based front-end. Each
// instance wraps the raw buffers in obj_t objects that share one datatype,
// records conjugation/uplo/structure on them, and calls PASTEMAC0(opname).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt       = PASTEMAC(ch,type); \
\
    /* The vector is described as an m x 1 matrix. */ \
    const inc_t x_rs     = incx; \
    const inc_t x_cs     = m * incx; \
\
    obj_t       alpha_obj; \
    obj_t       x_obj; \
    obj_t       c_obj; \
\
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alpha_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, x, x_rs, x_cs, &x_obj ); \
    bli_obj_set_conj( conjx, &x_obj ); \
\
    bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &c_obj ); \
    bli_obj_set_uplo( uploc, &c_obj ); \
    bli_obj_set_struc( BLIS_SYMMETRIC, &c_obj ); \
\
    PASTEMAC0(opname)( &alpha_obj, &x_obj, &c_obj, cntx ); \
}

INSERT_GENTFUNC_BASIC0( syr_front )
cython-blis-1.0.0/blis/_src/frame/2/syr/other/bli_syr_front.h 0000664 0000000 0000000 00000004114 14634250137 0024044 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based front-end for the syr operation.
// Takes the scalar alpha, the vector operand x, the matrix operand c, and a
// context pointer.
void bli_syr_front
(
obj_t* alpha,
obj_t* x,
obj_t* c,
cntx_t* cntx
);
// Prototype template for the typed syr_front interfaces: buffers of a
// single ctype plus their strides, with uplo/conjugation passed explicitly.
// NOTE(review): INSERT_GENTPROT_BASIC presumably expands GENTPROT once per
// basic datatype -- confirm against the macro definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
conj_t conjx, \
dim_t m, \
ctype* alpha, \
ctype* x, inc_t incx, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx \
);
INSERT_GENTPROT_BASIC( syr_front )
cython-blis-1.0.0/blis/_src/frame/2/syr2/ 0000775 0000000 0000000 00000000000 14634250137 0017761 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/syr2/bli_syr2.h 0000664 0000000 0000000 00000003344 14634250137 0021663 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_syr2_front.h"
cython-blis-1.0.0/blis/_src/frame/2/syr2/other/ 0000775 0000000 0000000 00000000000 14634250137 0021102 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/syr2/other/bli_syr2_front.c 0000664 0000000 0000000 00000014562 14634250137 0024213 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// her2 control trees, defined elsewhere, that bli_syr2_front() selects from.
// The "bs_ke" pair is chosen when every operand has unit stride (the
// unblocked implementation is called directly, without blocking); the "ge"
// pair is chosen otherwise (blocked algorithm). Within each pair, "lrow_ucol"
// serves row-stored lower / column-stored upper matrices and "lcol_urow"
// serves column-stored lower / row-stored upper matrices.
extern her2_t* her2_cntl_bs_ke_lrow_ucol;
extern her2_t* her2_cntl_bs_ke_lcol_urow;
extern her2_t* her2_cntl_ge_lrow_ucol;
extern her2_t* her2_cntl_ge_lcol_urow;
// Object-based front-end for syr2.
//
// Optionally checks the operands, makes a local copy-cast of alpha, picks a
// her2 control tree based on the strides of x, y and c and on the stored
// triangle of c, and then calls bli_her2_int() with conjh set to
// BLIS_NO_CONJUGATE so that the symmetric (not Hermitian) algorithm runs.
//
//   alpha - scalar object; it is copy-cast, not modified.
//   x, y  - vector objects; may be marked as packed when they have unit
//           increment and the blocked path is taken.
//   c     - symmetric matrix object that is updated.
//   cntx  - context forwarded to bli_her2_int().
void bli_syr2_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y,
       obj_t*  c,
       cntx_t* cntx
     )
{
    obj_t   alpha_cast;
    her2_t* tree;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_syr2_check( alpha, x, y, c );

    // Target datatypes of the two vectors.
    const num_t dt_x = bli_obj_target_dt( x );
    const num_t dt_y = bli_obj_target_dt( y );

    // Determine whether each operand has unit stride. For the matrix this
    // means being either row- or column-stored.
    const bool x_is_unit = ( bli_obj_vector_inc( x ) == 1 );
    const bool y_is_unit = ( bli_obj_vector_inc( y ) == 1 );
    const bool c_is_unit = ( bli_obj_is_row_stored( c ) ||
                             bli_obj_is_col_stored( c ) );

    // Copy-cast alpha into a detached local object whose datatype is the
    // type union of the datatypes of x and y.
    const num_t dt_scalar = bli_dt_union( dt_x, dt_y );

    bli_obj_scalar_init_detached_copy_of( dt_scalar,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    // With unit stride everywhere the unblocked implementation can be
    // called directly; otherwise a blocked tree is needed.
    const bool all_unit = ( x_is_unit && y_is_unit && c_is_unit );

    if ( !all_unit )
    {
        // Operands that already have unit stride are marked as packed so
        // that the blocked algorithm does not pack them needlessly.
        if ( x_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );
        if ( y_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y );
        if ( c_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c );
    }

    // Two trees cover the four triangle/storage combinations: row-stored
    // lower and column-stored upper share one tree, and the remaining two
    // cases share the other.
    const bool c_is_lower    = bli_obj_is_lower( c );
    const bool c_is_rows     = bli_obj_is_row_stored( c );
    const bool use_lrow_ucol = ( (  c_is_lower &&  c_is_rows ) ||
                                 ( !c_is_lower && !c_is_rows ) );

    if ( all_unit )
        tree = ( use_lrow_ucol ? her2_cntl_bs_ke_lrow_ucol
                               : her2_cntl_bs_ke_lcol_urow );
    else
        tree = ( use_lrow_ucol ? her2_cntl_ge_lrow_ucol
                               : her2_cntl_ge_lcol_urow );

    // Run the her2 back-end in its symmetric mode. The same copy-cast
    // scalar is passed for both scalar arguments.
    bli_her2_int( BLIS_NO_CONJUGATE,
                  &alpha_cast,
                  &alpha_cast,
                  x,
                  y,
                  c,
                  cntx,
                  tree );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
// Generator for the typed syr2 front-ends. Each generated function wraps its
// raw operands in temporary objects and forwards them to the object-based
// function named by PASTEMAC0(opname).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Each vector is described as an m x 1 matrix: its row stride is the */ \
    /* vector increment and its column stride is m times that increment. */ \
    const inc_t rs_x = incx; \
    const inc_t cs_x = m * incx; \
    const inc_t rs_y = incy; \
    const inc_t cs_y = m * incy; \
\
    obj_t alphao, xo, yo, co; \
\
    /* Scalar operand. */ \
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
\
    /* Vector operands, each tagged with its requested conjugation. */ \
    bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
    bli_obj_set_conj( conjx, &xo ); \
\
    bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \
    bli_obj_set_conj( conjy, &yo ); \
\
    /* Matrix operand: m x m, symmetric, stored in the uploc triangle. */ \
    bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \
    bli_obj_set_uplo( uploc, &co ); \
    bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \
\
    /* Hand off to the object-based front-end. */ \
    PASTEMAC0(opname)( &alphao, &xo, &yo, &co, cntx ); \
}

INSERT_GENTFUNC_BASIC0( syr2_front )
cython-blis-1.0.0/blis/_src/frame/2/syr2/other/bli_syr2_front.h 0000664 0000000 0000000 00000004232 14634250137 0024211 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype of the object-based front-end for syr2. It takes the scalar
// alpha, the vectors x and y and the matrix c as BLIS objects, plus the
// context to use.
void bli_syr2_front
(
obj_t* alpha,
obj_t* x,
obj_t* y,
obj_t* c,
cntx_t* cntx
);
//
// Prototype the BLAS-like (typed) syr2 front-ends: GENTPROT declares one
// PASTEMAC(ch,opname) function for each (ctype, ch) pair it is given.
//
// Parameters of each generated prototype:
//   uploc            - which triangle of c is referenced.
//   conjx, conjy     - conjugation to apply to x and y.
//   m                - length of x and y, and dimension of the m x m matrix c.
//   alpha            - pointer to the scalar.
//   x, incx          - first vector buffer and its increment.
//   y, incy          - second vector buffer and its increment.
//   c, rs_c, cs_c    - matrix buffer with its row and column strides.
//   cntx             - context to use.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     );

// GENTPROT above takes exactly three arguments (ctype, ch, opname), so it
// is instantiated through the single-name INSERT_GENTPROT_BASIC0 form. This
// mirrors the INSERT_GENTFUNC_BASIC0 instantiation of the definitions.
INSERT_GENTPROT_BASIC0( syr2_front )
cython-blis-1.0.0/blis/_src/frame/2/trmv/ 0000775 0000000 0000000 00000000000 14634250137 0020052 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv.h 0000664 0000000 0000000 00000003470 14634250137 0022045 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_trmv_cntl.h"
//#include "bli_trmv_front.h"
//#include "bli_trmv_int.h"
#include "bli_trmv_var.h"
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_unb_var1.c 0000664 0000000 0000000 00000010652 14634250137 0023635 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trmv, unblocked variant 1 (dot-product form).
//
// Overwrites x in place with alpha * op(A) * x, one element of x at a time.
// For each row i the diagonal contribution is applied by scaling x[i], and
// the off-diagonal part of that row is applied with a dotv kernel obtained
// from the context.
//
//   uploa            - triangle of A that is stored.
//   transa           - transposition and/or conjugation to apply to A.
//   diaga            - when not "nonunit", the stored diagonal is not read.
//   m                - dimension of A and length of x.
//   alpha            - pointer to the scalar.
//   a, rs_a, cs_a    - matrix buffer with its row and column strides.
//   x, incx          - vector buffer (read and written) and its increment.
//   cntx             - context used to look up the kernel.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Strides and stored triangle of op(A). Start from the no-transpose */ \
    /* view; a transpose swaps the strides and toggles the triangle. */ \
    inc_t  rs_op   = rs_a; \
    inc_t  cs_op   = cs_a; \
    uplo_t uplo_op = uploa; \
\
    if ( !( bli_does_notrans( transa ) ) ) \
    { \
        rs_op   = cs_a; \
        cs_op   = rs_a; \
        uplo_op = bli_uplo_toggled( uploa ); \
    } \
\
    const conj_t conja = bli_extract_conj( transa ); \
\
    /* Query the context for the dotv kernel of this datatype. */ \
    PASTECH(ch,dotv_ker_ft) dotv_fp = \
        bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
\
    if ( bli_is_upper( uplo_op ) ) \
    { \
        /* Rows are visited top to bottom, so the entries of x that follow */ \
        /* element i have not been overwritten yet when they are read. */ \
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            const dim_t n_after   = m - i - 1; \
            ctype*      diag_elem = a + (i  )*rs_op + (i  )*cs_op; \
            ctype*      row_after = a + (i  )*rs_op + (i+1)*cs_op; \
            ctype*      chi_i     = x + (i  )*incx; \
            ctype*      x_after   = x + (i+1)*incx; \
            ctype       diag_scale; \
            ctype       dot_acc; \
\
            /* chi_i = alpha * diag_elem * chi_i (diag_elem is skipped */ \
            /* unless the diagonal is nonunit). */ \
            PASTEMAC(ch,copys)( *alpha, diag_scale ); \
            if ( bli_is_nonunit_diag( diaga ) ) \
                PASTEMAC(ch,scalcjs)( conja, *diag_elem, diag_scale ); \
            PASTEMAC(ch,scals)( diag_scale, *chi_i ); \
\
            /* chi_i = chi_i + alpha * row_after * x_after; */ \
            dotv_fp \
            ( \
              conja, \
              BLIS_NO_CONJUGATE, \
              n_after, \
              row_after, cs_op, \
              x_after,   incx, \
              &dot_acc, \
              cntx \
            ); \
            PASTEMAC(ch,axpys)( *alpha, dot_acc, *chi_i ); \
        } \
    } \
    else /* lower triangle */ \
    { \
        /* Rows are visited bottom to top, so the entries of x that */ \
        /* precede element i have not been overwritten yet when read. */ \
        for ( dim_t step = 0; step < m; ++step ) \
        { \
            const dim_t i          = m - step - 1; \
            const dim_t n_before   = i; \
            ctype*      diag_elem  = a + (i  )*rs_op + (i  )*cs_op; \
            ctype*      row_before = a + (i  )*rs_op + (0  )*cs_op; \
            ctype*      chi_i      = x + (i  )*incx; \
            ctype*      x_before   = x + (0  )*incx; \
            ctype       diag_scale; \
            ctype       dot_acc; \
\
            /* chi_i = alpha * diag_elem * chi_i (diag_elem is skipped */ \
            /* unless the diagonal is nonunit). */ \
            PASTEMAC(ch,copys)( *alpha, diag_scale ); \
            if ( bli_is_nonunit_diag( diaga ) ) \
                PASTEMAC(ch,scalcjs)( conja, *diag_elem, diag_scale ); \
            PASTEMAC(ch,scals)( diag_scale, *chi_i ); \
\
            /* chi_i = chi_i + alpha * row_before * x_before; */ \
            dotv_fp \
            ( \
              conja, \
              BLIS_NO_CONJUGATE, \
              n_before, \
              row_before, cs_op, \
              x_before,   incx, \
              &dot_acc, \
              cntx \
            ); \
            PASTEMAC(ch,axpys)( *alpha, dot_acc, *chi_i ); \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( trmv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_unb_var2.c 0000664 0000000 0000000 00000010614 14634250137 0023634 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trmv, unblocked variant 2 (axpy form).
//
// Overwrites x in place with alpha * op(A) * x, one column of op(A) at a
// time. For each index i the off-diagonal part of column i, scaled by
// alpha * x[i], is accumulated into the already-visited part of x with an
// axpyv kernel obtained from the context; only then is x[i] itself scaled
// by the diagonal contribution.
//
//   uploa            - triangle of A that is stored.
//   transa           - transposition and/or conjugation to apply to A.
//   diaga            - when not "nonunit", the stored diagonal is not read.
//   m                - dimension of A and length of x.
//   alpha            - pointer to the scalar.
//   a, rs_a, cs_a    - matrix buffer with its row and column strides.
//   x, incx          - vector buffer (read and written) and its increment.
//   cntx             - context used to look up the kernel.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Strides and stored triangle of op(A). Start from the no-transpose */ \
    /* view; a transpose swaps the strides and toggles the triangle. */ \
    inc_t  rs_op   = rs_a; \
    inc_t  cs_op   = cs_a; \
    uplo_t uplo_op = uploa; \
\
    if ( !( bli_does_notrans( transa ) ) ) \
    { \
        rs_op   = cs_a; \
        cs_op   = rs_a; \
        uplo_op = bli_uplo_toggled( uploa ); \
    } \
\
    const conj_t conja = bli_extract_conj( transa ); \
\
    /* Query the context for the axpyv kernel of this datatype. */ \
    PASTECH(ch,axpyv_ker_ft) axpyv_fp = \
        bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
    if ( bli_is_upper( uplo_op ) ) \
    { \
        /* Columns are visited left to right; column i updates the */ \
        /* entries of x above element i. */ \
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            const dim_t n_above   = i; \
            ctype*      diag_elem = a + (i  )*rs_op + (i  )*cs_op; \
            ctype*      col_above = a + (0  )*rs_op + (i  )*cs_op; \
            ctype*      chi_i     = x + (i  )*incx; \
            ctype*      x_above   = x + (0  )*incx; \
            ctype       scaled_chi; \
            ctype       diag_scale; \
\
            /* x_above = x_above + alpha * chi_i * col_above; chi_i is */ \
            /* read here, before it is overwritten below. */ \
            PASTEMAC(ch,scal2s)( *alpha, *chi_i, scaled_chi ); \
            axpyv_fp \
            ( \
              conja, \
              n_above, \
              &scaled_chi, \
              col_above, rs_op, \
              x_above,   incx, \
              cntx \
            ); \
\
            /* chi_i = alpha * diag_elem * chi_i (diag_elem is skipped */ \
            /* unless the diagonal is nonunit). */ \
            PASTEMAC(ch,copys)( *alpha, diag_scale ); \
            if ( bli_is_nonunit_diag( diaga ) ) \
                PASTEMAC(ch,scalcjs)( conja, *diag_elem, diag_scale ); \
            PASTEMAC(ch,scals)( diag_scale, *chi_i ); \
        } \
    } \
    else /* lower triangle */ \
    { \
        /* Columns are visited right to left; column i updates the */ \
        /* entries of x below element i. */ \
        for ( dim_t step = 0; step < m; ++step ) \
        { \
            const dim_t i         = m - step - 1; \
            const dim_t n_below   = step; \
            ctype*      diag_elem = a + (i  )*rs_op + (i  )*cs_op; \
            ctype*      col_below = a + (i+1)*rs_op + (i  )*cs_op; \
            ctype*      chi_i     = x + (i  )*incx; \
            ctype*      x_below   = x + (i+1)*incx; \
            ctype       scaled_chi; \
            ctype       diag_scale; \
\
            /* x_below = x_below + alpha * chi_i * col_below; chi_i is */ \
            /* read here, before it is overwritten below. */ \
            PASTEMAC(ch,scal2s)( *alpha, *chi_i, scaled_chi ); \
            axpyv_fp \
            ( \
              conja, \
              n_below, \
              &scaled_chi, \
              col_below, rs_op, \
              x_below,   incx, \
              cntx \
            ); \
\
            /* chi_i = alpha * diag_elem * chi_i (diag_elem is skipped */ \
            /* unless the diagonal is nonunit). */ \
            PASTEMAC(ch,copys)( *alpha, diag_scale ); \
            if ( bli_is_nonunit_diag( diaga ) ) \
                PASTEMAC(ch,scalcjs)( conja, *diag_elem, diag_scale ); \
            PASTEMAC(ch,scals)( diag_scale, *chi_i ); \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( trmv_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_unf_var1.c 0000664 0000000 0000000 00000014357 14634250137 0023647 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trmv, fused variant 1 (dotxf form).
//
// Overwrites x in place with alpha * op(A) * x, processing x in blocks of up
// to b_fuse elements, where b_fuse is the fusing factor queried from the
// context for the dotxf kernel. For each block x1:
//   1. the f x f diagonal block A11 is applied with scalar loops
//      (x1 = alpha * A11 * x1), and then
//   2. the off-diagonal panel (A12 in the upper case, A10 in the lower case)
//      is applied with the dotxf kernel, accumulating into x1.
// The upper case walks the blocks from the top and the lower case from the
// bottom, so the part of x read in step 2 has not yet been overwritten.
//
//   uploa            - triangle of A that is stored.
//   transa           - transposition and/or conjugation to apply to A.
//   diaga            - when not "nonunit", the stored diagonal is not read.
//   m                - dimension of A and length of x.
//   alpha            - pointer to the scalar.
//   a, rs_a, cs_a    - matrix buffer with its row and column strides.
//   x, incx          - vector buffer (read and written) and its increment.
//   cntx             - context used to look up the kernel and blocksize.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* A10; \
ctype* A11; \
ctype* A12; \
ctype* a10t; \
ctype* alpha11; \
ctype* a12t; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha_alpha11_conj; \
ctype rho1; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_ahead, f_ahead; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* A transpose is realized by swapping the strides and toggling uplo. */ \
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
/* Blocks are visited from the top of x; f is this block's size. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_ahead = m - iter - f; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A12 = a + (i )*rs_at + (i+f)*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x1 = alpha * A11 * x1; */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_ahead = f - l - 1; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a12t = A11 + (l )*rs_at + (l+1)*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* chi11 = alpha * alpha11 * chi11; */ \
PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
if ( bli_is_nonunit_diag( diaga ) ) \
PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
\
/* chi11 = chi11 + alpha * a12t * x21; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
} \
\
/* x1 = x1 + alpha * A12 * x2; */ \
/* A12 is passed with its strides swapped (cs_at, rs_at). */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_ahead, \
f, \
alpha, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx, \
cntx \
); \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
/* Blocks are visited from the bottom of x; f is this block's size. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_ahead = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A10 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x1 = alpha * A11 * x1; */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_ahead = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a10t = A11 + (l )*rs_at + (0 )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* chi11 = alpha * alpha11 * chi11; */ \
PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
if ( bli_is_nonunit_diag( diaga ) ) \
PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
\
/* chi11 = chi11 + alpha * a10t * x01; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
} \
\
/* x1 = x1 + alpha * A10 * x0; */ \
/* A10 is passed with its strides swapped (cs_at, rs_at). */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_ahead, \
f, \
alpha, \
A10, cs_at, rs_at, \
x0, incx, \
one, \
x1, incx, \
cntx \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC0( trmv_unf_var1 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_unf_var2.c 0000664 0000000 0000000 00000014216 14634250137 0023642 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trmv, fused variant 2 (axpyf form).
//
// Overwrites x in place with alpha * op(A) * x, processing x in blocks of up
// to b_fuse elements, where b_fuse is the fusing factor queried from the
// context for the axpyf kernel. For each block x1:
//   1. the off-diagonal panel (A01 in the upper case, A21 in the lower case)
//      times x1 is accumulated into the already-visited part of x with the
//      axpyf kernel, while x1 still holds its original values, and then
//   2. the f x f diagonal block A11 is applied to x1 with scalar loops
//      (x1 = alpha * A11 * x1).
// The upper case walks the blocks from the top and the lower case from the
// bottom.
//
//   uploa            - triangle of A that is stored.
//   transa           - transposition and/or conjugation to apply to A.
//   diaga            - when not "nonunit", the stored diagonal is not read.
//   m                - dimension of A and length of x.
//   alpha            - pointer to the scalar.
//   a, rs_a, cs_a    - matrix buffer with its row and column strides.
//   x, incx          - vector buffer (read and written) and its increment.
//   cntx             - context used to look up the kernel and blocksize.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* A01; \
ctype* A11; \
ctype* A21; \
ctype* a01; \
ctype* alpha11; \
ctype* a21; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha_alpha11_conj; \
ctype alpha_chi11; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_behind, f_behind; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* A transpose is realized by swapping the strides and toggling uplo. */ \
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
/* Blocks are visited from the top of x; f is this block's size. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A01 = a + (0 )*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x0 = x0 + alpha * A01 * x1; */ \
/* This must precede the A11 update below, which overwrites x1. */ \
kfp_af \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
alpha, \
A01, rs_at, cs_at, \
x1, incx, \
x0, incx, \
cntx \
); \
\
/* x1 = alpha * A11 * x1; */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_behind = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a01 = A11 + (0 )*rs_at + (l )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* x01 = x01 + alpha * chi11 * a01; */ \
PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
} \
\
/* chi11 = alpha * alpha11 * chi11; */ \
PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
if ( bli_is_nonunit_diag( diaga ) ) \
PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
} \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
/* Blocks are visited from the bottom of x; f is this block's size. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A21 = a + (i+f)*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x2 = x2 + alpha * A21 * x1; */ \
/* This must precede the A11 update below, which overwrites x1. */ \
kfp_af \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
alpha, \
A21, rs_at, cs_at, \
x1, incx, \
x2, incx, \
cntx \
); \
\
/* x1 = alpha * A11 * x1; */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_behind = k; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a21 = A11 + (l+1)*rs_at + (l )*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* x21 = x21 + alpha * chi11 * a21; */ \
PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
} \
\
/* chi11 = alpha * alpha11 * chi11; */ \
PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
if ( bli_is_nonunit_diag( diaga ) ) \
PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( trmv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_var.h 0000664 0000000 0000000 00000005164 14634250137 0022717 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( varname ) emits the declaration of one object-based trmv
// variant. Every variant shares the same five-argument signature: the
// scalar alpha, the matrix a, the vector x, a context, and a control tree
// node.
//
#undef  GENPROT
#define GENPROT( varname ) \
\
void PASTEMAC0(varname)( obj_t*  alpha, \
                         obj_t*  a, \
                         obj_t*  x, \
                         cntx_t* cntx, \
                         cntl_t* cntl );

// Blocked variants (lower and upper triangular).
GENPROT( trmv_l_blk_var1 )
GENPROT( trmv_l_blk_var2 )
GENPROT( trmv_u_blk_var1 )
GENPROT( trmv_u_blk_var2 )

// Unblocked variants.
GENPROT( trmv_unb_var1 )
GENPROT( trmv_unb_var2 )

// Unblocked variants with fusing.
GENPROT( trmv_unf_var1 )
GENPROT( trmv_unf_var2 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) emits the declaration of one typed trmv
// variant whose operands are raw pointers of type ctype: an m-dimension
// triangular matrix a with row/column strides, and a vector x with
// increment incx.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( uplo_t  uploa, \
                           trans_t transa, \
                           diag_t  diaga, \
                           dim_t   m, \
                           ctype*  alpha, \
                           ctype*  a, inc_t rs_a, inc_t cs_a, \
                           ctype*  x, inc_t incx, \
                           cntx_t* cntx );

// Unblocked variants.
INSERT_GENTPROT_BASIC0( trmv_unb_var1 )
INSERT_GENTPROT_BASIC0( trmv_unb_var2 )

// Unblocked variants with fusing.
INSERT_GENTPROT_BASIC0( trmv_unf_var1 )
INSERT_GENTPROT_BASIC0( trmv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/bli_trmv_var_oapi.c 0000664 0000000 0000000 00000005612 14634250137 0023720 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based front-ends for the typed trmv variants.
//
// GENFRONT( opname, varname ) defines the object-based function for
// varname. The generated function:
//   1. reads the datatype, uplo/trans/diag properties, dimension, buffer
//      and strides from the matrix object a,
//   2. reads the buffer and increment from the vector object x,
//   3. obtains a buffer for alpha via bli_obj_buffer_for_1x1() using a's
//      datatype,
//   4. queries the function pointer for that datatype and calls it.
// The cntl argument is accepted for signature compatibility but is not
// used by the generated body.
//
// NOTE: the closing brace of the macro body must NOT end in a line
// continuation; otherwise the first GENFRONT() invocation below would be
// spliced into the macro definition itself.
//
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
    bli_init_once(); \
\
    num_t     dt        = bli_obj_dt( a ); \
\
    uplo_t    uploa     = bli_obj_uplo( a ); \
    trans_t   transa    = bli_obj_conjtrans_status( a ); \
    diag_t    diaga     = bli_obj_diag( a ); \
\
    dim_t     m         = bli_obj_length( a ); \
\
    void*     buf_a     = bli_obj_buffer_at_off( a ); \
    inc_t     rs_a      = bli_obj_row_stride( a ); \
    inc_t     cs_a      = bli_obj_col_stride( a ); \
\
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     incx      = bli_obj_vector_inc( x ); \
\
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
    /* Query a type-specific function pointer, except one that uses \
       void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,_unb,_vft) f = \
    PASTEMAC(varname,_qfp)( dt ); \
\
    f \
    ( \
      uploa, \
      transa, \
      diaga, \
      m, \
      buf_alpha, \
      buf_a, rs_a, cs_a, \
      buf_x, incx, \
      cntx \
    ); \
}

GENFRONT( trmv, trmv_unb_var1 )
GENFRONT( trmv, trmv_unb_var2 )

GENFRONT( trmv, trmv_unf_var1 )
GENFRONT( trmv, trmv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/ 0000775 0000000 0000000 00000000000 14634250137 0021173 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_cntl.c 0000664 0000000 0000000 00000014040 14634250137 0024174 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control tree nodes defined in other translation units. The pack/unpack
// nodes and the two "rp" gemv nodes are wired into the blocked trmv trees
// by bli_trmv_cntl_init(); the two "cp" gemv nodes are declared here but
// are not referenced by the code in this file.
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackv_t* unpackv_cntl;
extern gemv_t* gemv_cntl_rp_bs_dot;
extern gemv_t* gemv_cntl_rp_bs_axpy;
extern gemv_t* gemv_cntl_cp_bs_dot;
extern gemv_t* gemv_cntl_cp_bs_axpy;
// trmv control trees owned by this file. All four are NULL until
// bli_trmv_cntl_init() creates them, and are freed by
// bli_trmv_cntl_finalize().
//   - bs_ke_*: leaf trees (BLIS_UNB_FUSED, no sub-nodes).
//   - ge_*:    blocked trees (BLIS_BLOCKED) that use the matching bs_ke
//              tree as their trmv sub-node.
trmv_t* trmv_cntl_bs_ke_nrow_tcol = NULL;
trmv_t* trmv_cntl_bs_ke_ncol_trow = NULL;
trmv_t* trmv_cntl_ge_nrow_tcol = NULL;
trmv_t* trmv_cntl_ge_ncol_trow = NULL;
// Creates the four global trmv control trees.
//
// The leaf ("bs_ke") trees must be created first because the blocked
// ("ge") trees store pointers to them as their trmv sub-node.
//
// The parameter list is spelled ( void ) so that the definition is a true
// prototype and matches the declaration in the header.
void bli_trmv_cntl_init( void )
{
    // Create control trees for the lowest-level kernels. These trees induce
    // operations on (presumably) relatively small block-subvector problems.
    // They carry no blocksize id (0) and no sub-nodes.
    trmv_cntl_bs_ke_nrow_tcol =
        bli_trmv_cntl_obj_create( BLIS_UNB_FUSED,
                                  BLIS_VARIANT1,
                                  0,
                                  NULL, NULL, NULL,
                                  NULL, NULL, NULL );

    trmv_cntl_bs_ke_ncol_trow =
        bli_trmv_cntl_obj_create( BLIS_UNB_FUSED,
                                  BLIS_VARIANT2,
                                  0,
                                  NULL, NULL, NULL,
                                  NULL, NULL, NULL );

    // Create control trees for generally large problems. Here we choose a
    // variant that prioritizes keeping a subvector of x in cache.
    trmv_cntl_ge_nrow_tcol =
        bli_trmv_cntl_obj_create( BLIS_BLOCKED,
                                  BLIS_VARIANT1,             // use var1 to maximize x1 usage
                                  BLIS_M2,
                                  packm_cntl,                // pack A11 (if needed)
                                  packv_cntl,                // pack x1 (if needed)
                                  gemv_cntl_rp_bs_dot,       // gemv_rp needed by var1
                                  NULL,                      // gemv_cp not needed by var1
                                  trmv_cntl_bs_ke_nrow_tcol, // leaf trmv tree
                                  unpackv_cntl );            // unpack x1 (if packed)

    trmv_cntl_ge_ncol_trow =
        bli_trmv_cntl_obj_create( BLIS_BLOCKED,
                                  BLIS_VARIANT1,             // use var1 to maximize x1 usage
                                  BLIS_M2,
                                  packm_cntl,                // pack A11 (if needed)
                                  packv_cntl,                // pack x1 (if needed)
                                  gemv_cntl_rp_bs_axpy,      // gemv_rp needed by var1
                                  NULL,                      // gemv_cp not needed by var1
                                  trmv_cntl_bs_ke_ncol_trow, // leaf trmv tree
                                  unpackv_cntl );            // unpack x1 (if packed)
}
void bli_trmv_cntl_finalize()
{
bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol );
bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow );
bli_cntl_free_node( trmv_cntl_ge_nrow_tcol );
bli_cntl_free_node( trmv_cntl_ge_ncol_trow );
}
// Allocates a trmv control tree node with bli_malloc_intl(), stores every
// argument in the field of the same name, and returns the new node.
trmv_t* bli_trmv_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  packm_t*   sub_packm_a11,
                                  packv_t*   sub_packv_x1,
                                  gemv_t*    sub_gemv_rp,
                                  gemv_t*    sub_gemv_cp,
                                  trmv_t*    sub_trmv,
                                  unpackv_t* sub_unpackv_x1 )
{
    trmv_t* node = ( trmv_t* ) bli_malloc_intl( sizeof(trmv_t) );

    // Algorithm selection.
    node->impl_type      = impl_type;
    node->var_num        = var_num;
    node->bszid          = bszid;

    // Sub-nodes.
    node->sub_packm_a11  = sub_packm_a11;
    node->sub_packv_x1   = sub_packv_x1;
    node->sub_gemv_rp    = sub_gemv_rp;
    node->sub_gemv_cp    = sub_gemv_cp;
    node->sub_trmv       = sub_trmv;
    node->sub_unpackv_x1 = sub_unpackv_x1;

    return node;
}
// Fills in a caller-provided trmv control tree node: every argument after
// cntl is stored in the field of the same name. No memory is allocated.
void bli_trmv_cntl_obj_init( trmv_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             packm_t*   sub_packm_a11,
                             packv_t*   sub_packv_x1,
                             gemv_t*    sub_gemv_rp,
                             gemv_t*    sub_gemv_cp,
                             trmv_t*    sub_trmv,
                             unpackv_t* sub_unpackv_x1 )
{
    // The initializer names all nine fields of trmv_t, so assigning the
    // whole struct is equivalent to nine separate field assignments.
    *cntl = ( trmv_t )
    {
        .impl_type      = impl_type,
        .var_num        = var_num,
        .bszid          = bszid,
        .sub_packm_a11  = sub_packm_a11,
        .sub_packv_x1   = sub_packv_x1,
        .sub_gemv_rp    = sub_gemv_rp,
        .sub_gemv_cp    = sub_gemv_cp,
        .sub_trmv       = sub_trmv,
        .sub_unpackv_x1 = sub_unpackv_x1
    };
}
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_cntl.h 0000664 0000000 0000000 00000006150 14634250137 0024204 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control tree node for trmv. The first three fields select the algorithm;
// the remaining fields point at the sub-nodes used by the blocked variants.
typedef struct trmv_s
{
    impl_t            impl_type;      // implementation type (e.g. blocked or unblocked-fused)
    varnum_t          var_num;        // variant number
    bszid_t           bszid;          // blocksize id used when partitioning
    struct packm_s*   sub_packm_a11;  // packing of the diagonal block A11
    struct packv_s*   sub_packv_x1;   // packing of the subvector x1
    struct gemv_s*    sub_gemv_rp;    // gemv sub-node ("rp")
    struct gemv_s*    sub_gemv_cp;    // gemv sub-node ("cp")
    struct trmv_s*    sub_trmv;       // trmv sub-node applied to A11 and x1
    struct unpackv_s* sub_unpackv_x1; // unpacking of the subvector x1
} trmv_t;
// Accessor for the trmv sub-node of a control tree node. Both the argument
// and the whole expansion are parenthesized so that the macro works with
// any pointer expression (e.g. `p + 1` or `&nodes[0]`) and in any
// surrounding expression. The result is still an lvalue.
#define bli_cntl_sub_trmv( cntl ) ( (cntl)->sub_trmv )
// Create and free the global trmv control trees.
void bli_trmv_cntl_init( void );
void bli_trmv_cntl_finalize( void );

// Allocate a new trmv control tree node and populate it from the arguments.
trmv_t* bli_trmv_cntl_obj_create
     (
       impl_t     impl_type,
       varnum_t   var_num,
       bszid_t    bszid,
       packm_t*   sub_packm_a11,
       packv_t*   sub_packv_x1,
       gemv_t*    sub_gemv_rp,
       gemv_t*    sub_gemv_cp,
       trmv_t*    sub_trmv,
       unpackv_t* sub_unpackv_x1
     );

// Populate an existing trmv control tree node from the arguments.
void bli_trmv_cntl_obj_init
     (
       trmv_t*    cntl,
       impl_t     impl_type,
       varnum_t   var_num,
       bszid_t    bszid,
       packm_t*   sub_packm_a11,
       packv_t*   sub_packv_x1,
       gemv_t*    sub_gemv_rp,
       gemv_t*    sub_gemv_cp,
       trmv_t*    sub_trmv,
       unpackv_t* sub_unpackv_x1
     );
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_front.c 0000664 0000000 0000000 00000013522 14634250137 0024370 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
extern trmv_t* trmv_cntl_bs_ke_nrow_tcol;
extern trmv_t* trmv_cntl_bs_ke_ncol_trow;
extern trmv_t* trmv_cntl_ge_nrow_tcol;
extern trmv_t* trmv_cntl_ge_ncol_trow;
// Object-based front-end for trmv.
//
// Makes a local copy-cast of alpha, picks one of the four global trmv
// control trees based on the strides, storage and transposition of the
// operands, and hands everything to bli_trmv_int().
//
// Note that in the non-unit-stride path the pack schema of a and/or x may
// be set on the caller's objects.
void bli_trmv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx
     )
{
    trmv_t* tree;
    obj_t   alpha_cast;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_trmv_check( alpha, a, x );

    // Target datatypes of the matrix and the vector.
    const num_t targ_dt_a = bli_obj_target_dt( a );
    const num_t targ_dt_x = bli_obj_target_dt( x );

    // Determine whether each operand has unit stride: A qualifies when it
    // is row- or column-stored, x when its vector increment is one.
    const bool a_is_unit = ( bli_obj_is_row_stored( a ) ||
                             bli_obj_is_col_stored( a ) );
    const bool x_is_unit = ( bli_obj_vector_inc( x ) == 1 );

    // Copy-cast alpha into a detached local object. The datatype is the
    // type union of the two target datatypes so that no information is
    // lost unnecessarily during the computation.
    const num_t alpha_dt = bli_dt_union( targ_dt_a, targ_dt_x );

    bli_obj_scalar_init_detached_copy_of( alpha_dt,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    if ( a_is_unit && x_is_unit )
    {
        // Everything has unit stride: use a leaf tree that calls the
        // unblocked implementation directly. Two trees cover the four
        // transposition/storage combinations, because row-stored without
        // transpose behaves like column-stored with transpose (and the
        // remaining two cases likewise pair up).
        if ( bli_obj_has_notrans( a ) )
        {
            tree = ( bli_obj_is_row_stored( a ) )
                   ? trmv_cntl_bs_ke_nrow_tcol
                   : trmv_cntl_bs_ke_ncol_trow;
        }
        else // transposition requested
        {
            tree = ( bli_obj_is_row_stored( a ) )
                   ? trmv_cntl_bs_ke_ncol_trow
                   : trmv_cntl_bs_ke_nrow_tcol;
        }
    }
    else
    {
        // Flag whichever operand does have unit stride as already packed,
        // so the blocked algorithm does not pack it needlessly.
        if ( a_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );
        if ( x_is_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );

        // Same pairing as above, but keyed on storage tilt and selecting
        // the trees that perform blocking.
        if ( bli_obj_has_notrans( a ) )
        {
            tree = ( bli_obj_is_row_tilted( a ) )
                   ? trmv_cntl_ge_nrow_tcol
                   : trmv_cntl_ge_ncol_trow;
        }
        else // transposition requested
        {
            tree = ( bli_obj_is_row_tilted( a ) )
                   ? trmv_cntl_ge_ncol_trow
                   : trmv_cntl_ge_nrow_tcol;
        }
    }

    // Run the internal back-end with the local alpha and the chosen tree.
    bli_trmv_int( &alpha_cast, a, x, cntx, tree );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
// GENTFUNC( ctype, ch, opname ) defines a typed wrapper that attaches the
// raw alpha, a and x buffers to temporary objects, marks the matrix object
// as triangular with the requested uplo/trans/diag properties, and then
// calls the object-based function for opname.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* x is wrapped as an m x 1 object: the row stride is incx and the \
       column stride is m * incx. */ \
    inc_t rs_x = incx; \
    inc_t cs_x = m * incx; \
\
    obj_t alphao, ao, xo; \
\
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
    bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
    bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
\
    bli_obj_set_uplo( uploa, &ao ); \
    bli_obj_set_conjtrans( transa, &ao ); \
    bli_obj_set_diag( diaga, &ao ); \
\
    bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
    PASTEMAC0(opname)( &alphao, &ao, &xo, cntx ); \
}

INSERT_GENTFUNC_BASIC0( trmv_front )
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_front.h 0000664 0000000 0000000 00000004150 14634250137 0024372 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based trmv front-end: takes the scalar alpha, the matrix a, the
// vector x, and a context.
void bli_trmv_front( obj_t*  alpha,
                     obj_t*  a,
                     obj_t*  x,
                     cntx_t* cntx );
//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//
// GENTPROT here takes three parameters (ctype, ch, opname), so it must be
// instantiated with the single-name "BASIC0" inserter. This also matches
// the INSERT_GENTFUNC_BASIC0( trmv_front ) used for the definitions.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC0( trmv_front )
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_int.c 0000664 0000000 0000000 00000012017 14634250137 0024030 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by every object-based trmv variant that
// bli_trmv_int() can dispatch to.
#define FUNCPTR_T trmv_fp
typedef void (*FUNCPTR_T)( obj_t* alpha,
obj_t* a,
obj_t* x,
cntx_t* cntx,
trmv_t* cntl );
// Dispatch table, indexed by bli_trmv_int() as vars[uplo][n][i]:
//   uplo: 0 selects the lower triangular group, 1 the upper triangular one
//   n:    variant number taken from the control tree
//   i:    implementation type taken from the control tree
// The unblocked entries are the same in both groups; only the blocked
// entries differ. The last row of each group is NULL, i.e. no function is
// registered for a third variant.
static FUNCPTR_T vars[2][3][3] =
{
// lower triangular
{
// unblocked unblocked with fusing blocked
{ bli_trmv_unb_var1, bli_trmv_unf_var1, bli_trmv_l_blk_var1 },
{ bli_trmv_unb_var2, bli_trmv_unf_var2, bli_trmv_l_blk_var2 },
{ NULL, NULL, NULL },
},
// upper triangular
{
// unblocked unblocked with fusing blocked
{ bli_trmv_unb_var1, bli_trmv_unf_var1, bli_trmv_u_blk_var1 },
{ bli_trmv_unb_var2, bli_trmv_unf_var2, bli_trmv_u_blk_var2 },
{ NULL, NULL, NULL },
}
};
// Internal trmv back-end: selects a variant from the vars[] table using
// the triangle of A (toggled if A is marked for transposition) plus the
// variant number and implementation type stored in the control tree, and
// invokes it on an alias of A.
void bli_trmv_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   cntx_t* cntx,
                   trmv_t* cntl )
{
    obj_t a_alias;
    bool  tri_idx;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_trmv_check( alpha, a, x );

    // Nothing to do when A or x has a zero dimension.
    if ( ( bli_obj_has_zero_dim( a ) ) ||
         ( bli_obj_has_zero_dim( x ) ) ) return;

    // Work on an alias of A so the caller's object is left untouched.
    bli_obj_alias_to( a, &a_alias );

    // First table index: 0 for lower triangular, 1 otherwise.
    tri_idx = !( bli_obj_is_lower( &a_alias ) );

    // Cases where A is transposed are not implemented explicitly. Instead,
    // when A is marked as needing a transposition, the triangle index is
    // flipped so that the opposite-triangle algorithm is selected. The
    // subpartitions that algorithm takes from A inherit A's transposition
    // bit, so downstream subproblems still do the right thing. (Inducing
    // the transposition on the alias via bli_obj_induce_trans() followed by
    // bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, ... ) would be an
    // alternative way to reach the same result.) This choice does not
    // affect which kernel is optimal.
    if ( bli_obj_has_trans( &a_alias ) )
        tri_idx = !tri_idx;

    // Remaining table indices come from the control tree.
    const varnum_t variant = bli_cntl_var_num( cntl );
    const impl_t   impl    = bli_cntl_impl_type( cntl );

    // Look up and invoke the selected variant.
    FUNCPTR_T handler = vars[tri_idx][variant][impl];

    handler( alpha, &a_alias, x, cntx, cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_int.h 0000664 0000000 0000000 00000003454 14634250137 0024042 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal trmv back-end driven by a trmv control tree node.
void bli_trmv_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx,
       trmv_t* cntl
     );
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_l_blk_var1.c 0000664 0000000 0000000 00000010075 14634250137 0025254 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trmv, lower triangular, variant 1.
//
// Walks the diagonal of A in blocks. For each step it packs the diagonal
// block A11 and the subvector x1 (if needed), applies the trmv sub-node to
// them, accumulates alpha * A10 * x0 into x1 with the "rp" gemv sub-node,
// and unpacks x1.
void bli_trmv_l_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trmv_t* cntl )
{
    obj_t blk11, blk11_packed; // A11 and its packed form
    obj_t blk10;               // A10
    obj_t vec1, vec1_packed;   // x1 and its packed form
    obj_t vec0;                // x0

    // Prepare the pack objects before first use.
    bli_obj_init_pack( &blk11_packed );
    bli_obj_init_pack( &vec1_packed );

    // Dimension of A along which we partition.
    const dim_t dim = bli_obj_length( a );

    // Partition diagonally.
    dim_t done = 0;

    while ( done < dim )
    {
        // Algorithmic blocksize for this step.
        const dim_t step = bli_determine_blocksize_b( done, dim, a,
                                                      bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A11, A10, x1, and x0.
        bli_acquire_mpart_br2tl( BLIS_SUBPART11, done, step, a, &blk11 );
        bli_acquire_mpart_br2tl( BLIS_SUBPART10, done, step, a, &blk10 );
        bli_acquire_vpart_b2f( BLIS_SUBPART1, done, step, x, &vec1 );
        bli_acquire_vpart_b2f( BLIS_SUBPART0, done, step, x, &vec0 );

        // Set up the pack objects for A11 and x1 (if needed).
        bli_packm_init( &blk11, &blk11_packed,
                        cntx, bli_cntl_sub_packm_a11( cntl ) );
        bli_packv_init( &vec1, &vec1_packed,
                        cntx, bli_cntl_sub_packv_x1( cntl ) );

        // Copy/pack A11 and x1 (if needed).
        bli_packm_int( &blk11, &blk11_packed,
                       cntx, bli_cntl_sub_packm_a11( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &vec1, &vec1_packed,
                       cntx, bli_cntl_sub_packv_x1( cntl ) );

        // x1 = alpha * tril( A11 ) * x1;
        bli_trmv_int( alpha,
                      &blk11_packed,
                      &vec1_packed,
                      cntx,
                      bli_cntl_sub_trmv( cntl ) );

        // x1 = x1 + alpha * A10 * x0;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      alpha,
                      &blk10,
                      &vec0,
                      &BLIS_ONE,
                      &vec1_packed,
                      cntx,
                      bli_cntl_sub_gemv_rp( cntl ) );

        // Copy/unpack x1 (if x1 was packed).
        bli_unpackv_int( &vec1_packed, &vec1,
                         cntx, bli_cntl_sub_unpackv_x1( cntl ) );

        done += step;
    }

    // Hand any packing buffers acquired during packing back to the memory
    // manager.
    bli_packm_release( &blk11_packed, bli_cntl_sub_packm_a11( cntl ) );
    bli_packv_release( &vec1_packed, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_l_blk_var2.c 0000664 0000000 0000000 00000010075 14634250137 0025255 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trmv, lower triangular, variant 2.
//
// Walks the diagonal of A in blocks. For each step it packs the diagonal
// block A11 and the subvector x1 (if needed), accumulates
// alpha * A21 * x1 into x2 with the "cp" gemv sub-node, then applies the
// trmv sub-node to A11 and x1, and unpacks x1. The gemv must run before the
// trmv because it reads x1 before x1 is overwritten.
void bli_trmv_l_blk_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trmv_t* cntl )
{
    obj_t blk11, blk11_packed; // A11 and its packed form
    obj_t blk21;               // A21
    obj_t vec1, vec1_packed;   // x1 and its packed form
    obj_t vec2;                // x2

    // Prepare the pack objects before first use.
    bli_obj_init_pack( &blk11_packed );
    bli_obj_init_pack( &vec1_packed );

    // Dimension of A along which we partition.
    const dim_t dim = bli_obj_length( a );

    // Partition diagonally.
    dim_t done = 0;

    while ( done < dim )
    {
        // Algorithmic blocksize for this step.
        const dim_t step = bli_determine_blocksize_b( done, dim, a,
                                                      bli_cntl_bszid( cntl ), cntx );

        // Acquire partitions for A11, A21, x1, and x2.
        bli_acquire_mpart_br2tl( BLIS_SUBPART11, done, step, a, &blk11 );
        bli_acquire_mpart_br2tl( BLIS_SUBPART21, done, step, a, &blk21 );
        bli_acquire_vpart_b2f( BLIS_SUBPART1, done, step, x, &vec1 );
        bli_acquire_vpart_b2f( BLIS_SUBPART2, done, step, x, &vec2 );

        // Set up the pack objects for A11 and x1 (if needed).
        bli_packm_init( &blk11, &blk11_packed,
                        cntx, bli_cntl_sub_packm_a11( cntl ) );
        bli_packv_init( &vec1, &vec1_packed,
                        cntx, bli_cntl_sub_packv_x1( cntl ) );

        // Copy/pack A11 and x1 (if needed).
        bli_packm_int( &blk11, &blk11_packed,
                       cntx, bli_cntl_sub_packm_a11( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &vec1, &vec1_packed,
                       cntx, bli_cntl_sub_packv_x1( cntl ) );

        // x2 = x2 + alpha * A21 * x1;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      alpha,
                      &blk21,
                      &vec1_packed,
                      &BLIS_ONE,
                      &vec2,
                      cntx,
                      bli_cntl_sub_gemv_cp( cntl ) );

        // x1 = alpha * tril( A11 ) * x1;
        bli_trmv_int( alpha,
                      &blk11_packed,
                      &vec1_packed,
                      cntx,
                      bli_cntl_sub_trmv( cntl ) );

        // Copy/unpack x1 (if x1 was packed).
        bli_unpackv_int( &vec1_packed, &vec1,
                         cntx, bli_cntl_sub_unpackv_x1( cntl ) );

        done += step;
    }

    // Hand any packing buffers acquired during packing back to the memory
    // manager.
    bli_packm_release( &blk11_packed, bli_cntl_sub_packm_a11( cntl ) );
    bli_packv_release( &vec1_packed, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_u_blk_var1.c 0000664 0000000 0000000 00000010075 14634250137 0025265 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked upper-triangular trmv, variant 1.
//
// Walks the diagonal of A one block at a time using the "_f" blocksize query
// together with the tl2br (matrix) and f2b (vector) partitioning helpers.
// Every step overwrites x1 with alpha * triu( A11 ) * x1 via trmv and then
// adds alpha * A12 * x2 into it via gemv; x2 is only read.
//
//   alpha - scalar object passed to both the trmv and the gemv sub-problem
//   a     - matrix object; its length is used as the problem dimension
//   x     - vector object that is updated in place
//   cntx  - context forwarded unchanged to every sub-operation
//   cntl  - control-tree node providing the blocksize id and the sub-nodes
//           for packing, trmv, gemv and unpacking
void bli_trmv_u_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trmv_t* cntl )
{
	obj_t diag_blk, diag_blk_pack;   // A11 and its (optionally) packed image
	obj_t right_blk;                 // A12
	obj_t x_cur, x_cur_pack;         // x1 and its (optionally) packed image
	obj_t x_rest;                    // x2
	dim_t dim;
	dim_t offset;
	dim_t blk;

	// Put both pack objects into their initial state.
	bli_obj_init_pack( &diag_blk_pack );
	bli_obj_init_pack( &x_cur_pack );

	// Problem dimension.
	dim = bli_obj_length( a );

	// March along the diagonal, one algorithmic block per pass.
	offset = 0;
	while ( offset < dim )
	{
		// Algorithmic blocksize for this pass.
		blk = bli_determine_blocksize_f( offset, dim, a,
		                                 bli_cntl_bszid( cntl ), cntx );

		// Matrix partitions: A11 and A12.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11,
		                         offset, blk, a, &diag_blk );
		bli_acquire_mpart_tl2br( BLIS_SUBPART12,
		                         offset, blk, a, &right_blk );

		// Vector partitions: x1 and x2.
		bli_acquire_vpart_f2b( BLIS_SUBPART1,
		                       offset, blk, x, &x_cur );
		bli_acquire_vpart_f2b( BLIS_SUBPART2,
		                       offset, blk, x, &x_rest );

		// Set up the pack objects for A11 and x1 (if packing is requested).
		bli_packm_init( &diag_blk, &diag_blk_pack,
		                cntx, bli_cntl_sub_packm_a11( cntl ) );
		bli_packv_init( &x_cur, &x_cur_pack,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		// Perform the copy/pack of A11 and x1 (if packing is requested).
		bli_packm_int( &diag_blk, &diag_blk_pack,
		               cntx, bli_cntl_sub_packm_a11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x_cur, &x_cur_pack,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		// x1 = alpha * triu( A11 ) * x1;
		bli_trmv_int( alpha,
		              &diag_blk_pack,
		              &x_cur_pack,
		              cntx,
		              bli_cntl_sub_trmv( cntl ) );

		// x1 = x1 + alpha * A12 * x2;
		bli_gemv_int( BLIS_NO_TRANSPOSE,
		              BLIS_NO_CONJUGATE,
		              alpha,
		              &right_blk,
		              &x_rest,
		              &BLIS_ONE,
		              &x_cur_pack,
		              cntx,
		              bli_cntl_sub_gemv_rp( cntl ) );

		// Write the result back into x1 (if x1 was packed).
		bli_unpackv_int( &x_cur_pack, &x_cur,
		                 cntx, bli_cntl_sub_unpackv_x1( cntl ) );

		offset += blk;
	}

	// Hand any pack buffers that were acquired back to the memory manager.
	bli_packm_release( &diag_blk_pack, bli_cntl_sub_packm_a11( cntl ) );
	bli_packv_release( &x_cur_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trmv/other/bli_trmv_u_blk_var2.c 0000664 0000000 0000000 00000010075 14634250137 0025266 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked upper-triangular trmv, variant 2.
//
// Each step accumulates x0 += alpha * A01 * x1 via gemv and then overwrites
// x1 with alpha * triu( A11 ) * x1 via trmv.
//
// The diagonal must be traversed from the top-left to the bottom-right
// ("_f" blocksize query, tl2br / f2b partitioning): every step reads x1 as
// an input and adds into the sub-vector x0 that precedes it, so a block of x
// has to be consumed as x1 before any later step accumulates into it.
// Traversing in the opposite direction would feed already-updated entries of
// x into the gemv and trmv and produce a wrong result.
//
//   alpha - scalar object passed to both the gemv and the trmv sub-problem
//   a     - matrix object; its length is used as the problem dimension
//   x     - vector object that is updated in place
//   cntx  - context forwarded unchanged to every sub-operation
//   cntl  - control-tree node providing the blocksize id and the sub-nodes
//           for packing, gemv, trmv and unpacking
void bli_trmv_u_blk_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trmv_t* cntl )
{
	obj_t   a11, a11_pack;
	obj_t   a01;
	obj_t   x1, x1_pack;
	obj_t   x0;
	dim_t   mn;
	dim_t   ij;
	dim_t   b_alg;

	// Initialize objects for packing.
	bli_obj_init_pack( &a11_pack );
	bli_obj_init_pack( &x1_pack );

	// Query dimension.
	mn = bli_obj_length( a );

	// Partition diagonally, moving forward (top-left to bottom-right).
	for ( ij = 0; ij < mn; ij += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_f( ij, mn, a,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for A11, A01, x1, and x0.
		bli_acquire_mpart_tl2br( BLIS_SUBPART11,
		                         ij, b_alg, a, &a11 );
		bli_acquire_mpart_tl2br( BLIS_SUBPART01,
		                         ij, b_alg, a, &a01 );
		bli_acquire_vpart_f2b( BLIS_SUBPART1,
		                       ij, b_alg, x, &x1 );
		bli_acquire_vpart_f2b( BLIS_SUBPART0,
		                       ij, b_alg, x, &x0 );

		// Initialize objects for packing A11 and x1 (if needed).
		bli_packm_init( &a11, &a11_pack,
		                cntx, bli_cntl_sub_packm_a11( cntl ) );
		bli_packv_init( &x1, &x1_pack,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		// Copy/pack A11, x1 (if needed).
		bli_packm_int( &a11, &a11_pack,
		               cntx, bli_cntl_sub_packm_a11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x1, &x1_pack,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		// x0 = x0 + alpha * A01 * x1;
		// (must run before the trmv below, which overwrites x1)
		bli_gemv_int( BLIS_NO_TRANSPOSE,
		              BLIS_NO_CONJUGATE,
		              alpha,
		              &a01,
		              &x1_pack,
		              &BLIS_ONE,
		              &x0,
		              cntx,
		              bli_cntl_sub_gemv_cp( cntl ) );

		// x1 = alpha * triu( A11 ) * x1;
		bli_trmv_int( alpha,
		              &a11_pack,
		              &x1_pack,
		              cntx,
		              bli_cntl_sub_trmv( cntl ) );

		// Copy/unpack x1 (if x1 was packed).
		bli_unpackv_int( &x1_pack, &x1,
		                 cntx, bli_cntl_sub_unpackv_x1( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
	bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/ 0000775 0000000 0000000 00000000000 14634250137 0020060 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv.h 0000664 0000000 0000000 00000003470 14634250137 0022061 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// NOTE: level-2 control tree code is temporarily disabled.
//#include "bli_trsv_cntl.h"
//#include "bli_trsv_front.h"
//#include "bli_trsv_int.h"
#include "bli_trsv_var.h"
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_unb_var1.c 0000664 0000000 0000000 00000010650 14634250137 0023647 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Unblocked trsv, variant 1 (dotv-based). One instance of the function below
// is generated per datatype by INSERT_GENTFUNC_BASIC0.
//
// x is first scaled by alpha, then solved in place one element at a time:
// the dot product of the already-solved part of x with the matching part of
// the current row of A is subtracted from the current element, which is then
// divided by the diagonal element unless diaga requests a unit diagonal.
// A transposition requested through transa is realised by swapping the row
// and column strides and toggling uploa; conjugation is applied inside the
// dot product and when reading the diagonal element.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Start from the no-transpose view of A and flip it only if needed. */ \
	inc_t  rs_eff   = rs_a; \
	inc_t  cs_eff   = cs_a; \
	uplo_t uplo_eff = uploa; \
\
	if ( !bli_does_notrans( transa ) ) \
	{ \
		rs_eff   = cs_a; \
		cs_eff   = rs_a; \
		uplo_eff = bli_uplo_toggled( uploa ); \
	} \
\
	const conj_t conja = bli_extract_conj( transa ); \
\
	/* x = alpha * x; */ \
	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  m, \
	  alpha, \
	  x, incx, \
	  cntx, \
	  NULL  \
	); \
\
	/* Fetch the dotv kernel for this datatype from the context. */ \
	PASTECH(ch,dotv_ker_ft) kfp_tv; \
	kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
\
	/* Only two cases remain: effectively upper or effectively lower. */ \
	const int solve_upper = bli_is_upper( uplo_eff ); \
\
	/* n_solved counts the elements of x that are already final. */ \
	for ( dim_t n_solved = 0; n_solved < m; ++n_solved ) \
	{ \
		/* Upper: rows are visited last-to-first and the solved part of x
		   sits right after the current element. Lower: rows are visited
		   first-to-last and the solved part starts at index 0. */ \
		const dim_t row  = ( solve_upper ? m - n_solved - 1 : n_solved ); \
		const dim_t col0 = ( solve_upper ? row + 1          : 0        ); \
\
		ctype* diag     = a + (row)*rs_eff + (row )*cs_eff; \
		ctype* a_row    = a + (row)*rs_eff + (col0)*cs_eff; \
		ctype* chi      = x + (row )*incx; \
		ctype* x_solved = x + (col0)*incx; \
		ctype  rho; \
\
		/* chi = chi - a_row * x_solved; */ \
		kfp_tv \
		( \
		  conja, \
		  BLIS_NO_CONJUGATE, \
		  n_solved, \
		  a_row,    cs_eff, \
		  x_solved, incx, \
		  &rho, \
		  cntx  \
		); \
		PASTEMAC(ch,subs)( rho, *chi ); \
\
		/* chi = chi / diag; */ \
		if ( bli_is_nonunit_diag( diaga ) ) \
		{ \
			ctype diag_conj; \
\
			PASTEMAC(ch,copycjs)( conja, *diag, diag_conj ); \
			PASTEMAC(ch,invscals)( diag_conj, *chi ); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC0( trsv_unb_var1 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_unb_var2.c 0000664 0000000 0000000 00000010624 14634250137 0023651 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Unblocked trsv, variant 2 (axpyv-based). One instance of the function
// below is generated per datatype by INSERT_GENTFUNC_BASIC0.
//
// x is first scaled by alpha, then solved in place one element at a time:
// the current element is divided by the diagonal element (unless diaga
// requests a unit diagonal) and the matching column of A, scaled by the
// negated result, is added to the not-yet-solved part of x.
// A transposition requested through transa is realised by swapping the row
// and column strides and toggling uploa; conjugation is applied inside the
// axpyv kernel and when reading the diagonal element.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Start from the no-transpose view of A and flip it only if needed. */ \
	inc_t  rs_eff   = rs_a; \
	inc_t  cs_eff   = cs_a; \
	uplo_t uplo_eff = uploa; \
\
	if ( !bli_does_notrans( transa ) ) \
	{ \
		rs_eff   = cs_a; \
		cs_eff   = rs_a; \
		uplo_eff = bli_uplo_toggled( uploa ); \
	} \
\
	const conj_t conja = bli_extract_conj( transa ); \
\
	/* x = alpha * x; */ \
	PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  m, \
	  alpha, \
	  x, incx, \
	  cntx, \
	  NULL  \
	); \
\
	/* Fetch the axpyv kernel for this datatype from the context. */ \
	PASTECH(ch,axpyv_ker_ft) kfp_av; \
	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	/* Only two cases remain: effectively upper or effectively lower. */ \
	if ( bli_is_upper( uplo_eff ) ) \
	{ \
		/* Visit the diagonal from its last element to its first. */ \
		for ( dim_t step = 0; step < m; ++step ) \
		{ \
			const dim_t i         = m - step - 1; \
			const dim_t n_pending = i; \
			ctype*      diag      = a + (i  )*rs_eff + (i  )*cs_eff; \
			ctype*      col_above = a + (0  )*rs_eff + (i  )*cs_eff; \
			ctype*      chi       = x + (i  )*incx; \
			ctype*      x_above   = x + (0  )*incx; \
			ctype       neg_chi; \
\
			/* chi = chi / diag; */ \
			if ( bli_is_nonunit_diag( diaga ) ) \
			{ \
				ctype diag_conj; \
\
				PASTEMAC(ch,copycjs)( conja, *diag, diag_conj ); \
				PASTEMAC(ch,invscals)( diag_conj, *chi ); \
			} \
\
			/* x_above = x_above - chi * col_above; */ \
			PASTEMAC(ch,neg2s)( *chi, neg_chi ); \
			kfp_av \
			( \
			  conja, \
			  n_pending, \
			  &neg_chi, \
			  col_above, rs_eff, \
			  x_above,   incx, \
			  cntx  \
			); \
		} \
	} \
	else /* effectively lower */ \
	{ \
		/* Visit the diagonal from its first element to its last. */ \
		for ( dim_t step = 0; step < m; ++step ) \
		{ \
			const dim_t i         = step; \
			const dim_t n_pending = m - step - 1; \
			ctype*      diag      = a + (i  )*rs_eff + (i  )*cs_eff; \
			ctype*      col_below = a + (i+1)*rs_eff + (i  )*cs_eff; \
			ctype*      chi       = x + (i  )*incx; \
			ctype*      x_below   = x + (i+1)*incx; \
			ctype       neg_chi; \
\
			/* chi = chi / diag; */ \
			if ( bli_is_nonunit_diag( diaga ) ) \
			{ \
				ctype diag_conj; \
\
				PASTEMAC(ch,copycjs)( conja, *diag, diag_conj ); \
				PASTEMAC(ch,invscals)( diag_conj, *chi ); \
			} \
\
			/* x_below = x_below - chi * col_below; */ \
			PASTEMAC(ch,neg2s)( *chi, neg_chi ); \
			kfp_av \
			( \
			  conja, \
			  n_pending, \
			  &neg_chi, \
			  col_below, rs_eff, \
			  x_below,   incx, \
			  cntx  \
			); \
		} \
	} \
}
INSERT_GENTFUNC_BASIC0( trsv_unb_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_unf_var1.c 0000664 0000000 0000000 00000014415 14634250137 0023656 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Fused trsv, variant 1 (dotxf-based). One instance of the function below is
// generated per datatype by INSERT_GENTFUNC_BASIC0.
//
// x is scaled by alpha and then solved in place in panels of up to b_fuse
// elements. For each panel, the contribution of the already-solved part of x
// is subtracted with a single dotxf kernel call, after which the small
// triangular diagonal block is solved with scalar dot products.
//
//   uploa        - which triangle of A is stored
//   transa       - transposition and/or conjugation applied to A
//   diaga        - when not "nonunit", the division by the diagonal is skipped
//   m            - dimension of A and length of x
//   alpha        - scalar that x is multiplied by before the solve
//   a,rs_a,cs_a  - matrix buffer with its row and column strides
//   x,incx       - vector buffer (overwritten) with its element stride
//   cntx         - context queried for the kernel and the fusing factor
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* one = PASTEMAC(ch,1); \
ctype* minus_one = PASTEMAC(ch,m1); \
ctype* A10; \
ctype* A11; \
ctype* A12; \
ctype* a10t; \
ctype* alpha11; \
ctype* a12t; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha11_conj; \
ctype rho1; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_behind, f_behind; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* x = alpha * x; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
alpha, \
x, incx, \
cntx, \
NULL \
); \
\
/* A transposition is realised by swapping the strides and toggling the
   stored triangle, so the loops below never transpose explicitly. */ \
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
/* Panels are visited from the end of x towards its start; n_behind is
   the number of elements after the panel, which are already solved. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_behind = iter; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A12 = a + (i )*rs_at + (i+f)*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x1 = x1 - A12 * x2; */ \
/* A12 is handed to the kernel with cs_at and rs_at in swapped order. */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A12, cs_at, rs_at, \
x2, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / triu( A11 ); */ \
/* Within the panel, l runs from f-1 down to 0 and f_behind counts the
   panel elements after l. */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_behind = k; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a12t = A11 + (l )*rs_at + (l+1)*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* chi11 = chi11 - a12t * x21; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
/* Panels are visited from the start of x towards its end; n_behind is
   the number of elements before the panel, which are already solved. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_behind = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A10 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x1 = x1 - A10 * x0; */ \
/* A10 is handed to the kernel with cs_at and rs_at in swapped order. */ \
kfp_df \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_behind, \
f, \
minus_one, \
A10, cs_at, rs_at, \
x0, incx, \
one, \
x1, incx, \
cntx \
); \
\
/* x1 = x1 / tril( A11 ); */ \
/* Within the panel, l runs from 0 up to f-1 and f_behind counts the
   panel elements before l. */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_behind = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a10t = A11 + (l )*rs_at + (0 )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* chi11 = chi11 - a10t * x01; */ \
PASTEMAC(ch,set0s)( rho1 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
else \
{ \
for ( j = 0; j < f_behind; ++j ) \
PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
} \
PASTEMAC(ch,subs)( rho1, *chi11 ); \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( trsv_unf_var1 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_unf_var2.c 0000664 0000000 0000000 00000014274 14634250137 0023662 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Fused trsv, variant 2 (axpyf-based). One instance of the function below is
// generated per datatype by INSERT_GENTFUNC_BASIC0.
//
// x is scaled by alpha and then solved in place in panels of up to b_fuse
// elements. For each panel, the small triangular diagonal block is solved
// first with scalar axpy updates; the solved panel is then eliminated from
// the not-yet-solved part of x with a single axpyf kernel call.
//
//   uploa        - which triangle of A is stored
//   transa       - transposition and/or conjugation applied to A
//   diaga        - when not "nonunit", the division by the diagonal is skipped
//   m            - dimension of A and length of x
//   alpha        - scalar that x is multiplied by before the solve
//   a,rs_a,cs_a  - matrix buffer with its row and column strides
//   x,incx       - vector buffer (overwritten) with its element stride
//   cntx         - context queried for the kernel and the fusing factor
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* minus_one = PASTEMAC(ch,m1); \
ctype* A01; \
ctype* A11; \
ctype* A21; \
ctype* a01; \
ctype* alpha11; \
ctype* a21; \
ctype* x0; \
ctype* x1; \
ctype* x2; \
ctype* x01; \
ctype* chi11; \
ctype* x21; \
ctype alpha11_conj; \
ctype minus_chi11; \
dim_t iter, i, k, j, l; \
dim_t b_fuse, f; \
dim_t n_ahead, f_ahead; \
inc_t rs_at, cs_at; \
uplo_t uploa_trans; \
conj_t conja; \
\
/* x = alpha * x; */ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m, \
alpha, \
x, incx, \
cntx, \
NULL \
); \
\
/* A transposition is realised by swapping the strides and toggling the
   stored triangle, so the loops below never transpose explicitly. */ \
if ( bli_does_notrans( transa ) ) \
{ \
rs_at = rs_a; \
cs_at = cs_a; \
uploa_trans = uploa; \
} \
else /* if ( bli_does_trans( transa ) ) */ \
{ \
rs_at = cs_a; \
cs_at = rs_a; \
uploa_trans = bli_uplo_toggled( uploa ); \
} \
\
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
/* We reduce all of the possible cases down to just lower/upper. */ \
if ( bli_is_upper( uploa_trans ) ) \
{ \
/* Panels are visited from the end of x towards its start; n_ahead is
   the number of elements before the panel, which are still unsolved. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
i = m - iter - f; \
n_ahead = i; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A01 = a + (0 )*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
x0 = x + (0 )*incx; \
\
/* x1 = x1 / triu( A11 ); */ \
/* Within the panel, l runs from f-1 down to 0 and f_ahead counts the
   panel elements before l. */ \
for ( k = 0; k < f; ++k ) \
{ \
l = f - k - 1; \
f_ahead = l; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a01 = A11 + (0 )*rs_at + (l )*cs_at; \
chi11 = x1 + (l )*incx; \
x01 = x1 + (0 )*incx; \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
\
/* x01 = x01 - chi11 * a01; */ \
PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
} \
} \
\
/* x0 = x0 - A01 * x1; */ \
/* Runs after the panel solve above, so x1 already holds final values. */ \
kfp_af \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_ahead, \
f, \
minus_one, \
A01, rs_at, cs_at, \
x1, incx, \
x0, incx, \
cntx \
); \
} \
} \
else /* if ( bli_is_lower( uploa_trans ) ) */ \
{ \
/* Panels are visited from the start of x towards its end; n_ahead is
   the number of elements after the panel, which are still unsolved. */ \
for ( iter = 0; iter < m; iter += f ) \
{ \
f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
i = iter; \
n_ahead = m - iter - f; \
A11 = a + (i )*rs_at + (i )*cs_at; \
A21 = a + (i+f)*rs_at + (i )*cs_at; \
x1 = x + (i )*incx; \
x2 = x + (i+f)*incx; \
\
/* x1 = x1 / tril( A11 ); */ \
/* Within the panel, l runs from 0 up to f-1 and f_ahead counts the
   panel elements after l. */ \
for ( k = 0; k < f; ++k ) \
{ \
l = k; \
f_ahead = f - k - 1; \
alpha11 = A11 + (l )*rs_at + (l )*cs_at; \
a21 = A11 + (l+1)*rs_at + (l )*cs_at; \
chi11 = x1 + (l )*incx; \
x21 = x1 + (l+1)*incx; \
\
/* chi11 = chi11 / alpha11; */ \
if ( bli_is_nonunit_diag( diaga ) ) \
{ \
PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \
PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \
} \
\
/* x21 = x21 - chi11 * a21; */ \
PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \
if ( bli_is_conj( conja ) ) \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
} \
else \
{ \
for ( j = 0; j < f_ahead; ++j ) \
PASTEMAC(ch,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
} \
} \
\
/* x2 = x2 - A21 * x1; */ \
/* Runs after the panel solve above, so x1 already holds final values. */ \
kfp_af \
( \
conja, \
BLIS_NO_CONJUGATE, \
n_ahead, \
f, \
minus_one, \
A21, rs_at, cs_at, \
x1, incx, \
x2, incx, \
cntx \
); \
} \
} \
}
INSERT_GENTFUNC_BASIC0( trsv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_var.h 0000664 0000000 0000000 00000005164 14634250137 0022733 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one object-based trsv variant. Every variant
// shares the same signature: the scalar alpha, the matrix a, the vector x,
// a context and a control-tree node. PASTEMAC0 builds the function name from
// opname.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* x, \
cntx_t* cntx, \
cntl_t* cntl \
);
// Blocked variants (lower/upper, variants 1 and 2).
GENPROT( trsv_l_blk_var1 )
GENPROT( trsv_l_blk_var2 )
GENPROT( trsv_u_blk_var1 )
GENPROT( trsv_u_blk_var2 )
// Unblocked variants.
GENPROT( trsv_unb_var1 )
GENPROT( trsv_unb_var2 )
// Unblocked fused variants.
GENPROT( trsv_unf_var1 )
GENPROT( trsv_unf_var2 )
//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares one typed trsv variant operating
// on raw buffers: the uplo/trans/diag properties of A, the dimension m, the
// scalar alpha, the matrix buffer with its row/column strides, the vector
// buffer with its increment, and a context. PASTEMAC builds the function
// name from the type character ch and varname.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* x, inc_t incx, \
cntx_t* cntx \
);
// Instantiate the typed prototypes once per datatype for each variant.
INSERT_GENTPROT_BASIC0( trsv_unb_var1 )
INSERT_GENTPROT_BASIC0( trsv_unb_var2 )
INSERT_GENTPROT_BASIC0( trsv_unf_var1 )
INSERT_GENTPROT_BASIC0( trsv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/bli_trsv_var_oapi.c 0000664 0000000 0000000 00000005612 14634250137 0023734 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   GENFRONT( opname, varname ) defines the object-based entry point for one
   trsv variant. The generated function:
     - reads the datatype, uplo, conjugation/transposition and diagonal
       properties, the length, and the row/column strides from a;
     - reads the buffer and vector increment from x;
     - obtains a buffer for alpha in the datatype of a;
     - queries the type-specific (void*-based) function for varname and
       calls it with those values.
   The cntl argument is part of the signature but is not referenced by the
   generated body.

   NOTE: the macro body must end on the closing brace WITHOUT a trailing
   backslash. A continuation there splices the next source line (the first
   GENFRONT instantiation) into the macro definition, so that variant would
   never be defined.
*/
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
    bli_init_once(); \
\
    num_t     dt        = bli_obj_dt( a ); \
\
    uplo_t    uploa     = bli_obj_uplo( a ); \
    trans_t   transa    = bli_obj_conjtrans_status( a ); \
    diag_t    diaga     = bli_obj_diag( a ); \
\
    dim_t     m         = bli_obj_length( a ); \
\
    void*     buf_a     = bli_obj_buffer_at_off( a ); \
    inc_t     rs_a      = bli_obj_row_stride( a ); \
    inc_t     cs_a      = bli_obj_col_stride( a ); \
\
    void*     buf_x     = bli_obj_buffer_at_off( x ); \
    inc_t     incx      = bli_obj_vector_inc( x ); \
\
    void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
    /* Query a type-specific function pointer, except one that uses \
       void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,_unb,_vft) f = \
    PASTEMAC(varname,_qfp)( dt ); \
\
    f \
    ( \
      uploa, \
      transa, \
      diaga, \
      m, \
      buf_alpha, \
      buf_a, rs_a, cs_a, \
      buf_x, incx, \
      cntx \
    ); \
}

GENFRONT( trsv, trsv_unb_var1 )
GENFRONT( trsv, trsv_unb_var2 )
GENFRONT( trsv, trsv_unf_var1 )
GENFRONT( trsv, trsv_unf_var2 )
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/ 0000775 0000000 0000000 00000000000 14634250137 0021201 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_cntl.c 0000664 0000000 0000000 00000014757 14634250137 0024227 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Control trees defined in other translation units. bli_trsv_cntl_init()
// passes scalv_cntl, packm_cntl, packv_cntl, unpackv_cntl and the two
// gemv_cntl_rp_* trees as sub-trees of the blocked trsv trees; the
// gemv_cntl_cp_* trees are declared here but not referenced in this file.
extern scalv_t* scalv_cntl;
extern packm_t* packm_cntl;
extern packv_t* packv_cntl;
extern unpackv_t* unpackv_cntl;
extern gemv_t* gemv_cntl_rp_bs_dot;
extern gemv_t* gemv_cntl_rp_bs_axpy;
extern gemv_t* gemv_cntl_cp_bs_dot;
extern gemv_t* gemv_cntl_cp_bs_axpy;
// The trsv control trees themselves. They are NULL until
// bli_trsv_cntl_init() assigns them and are released by
// bli_trsv_cntl_finalize().
//   bs_ke_*: unblocked (fused) trees with no sub-trees.
//   ge_*:    blocked trees that use the corresponding bs_ke_* tree as
//            their trsv sub-tree.
trsv_t* trsv_cntl_bs_ke_nrow_tcol = NULL;
trsv_t* trsv_cntl_bs_ke_ncol_trow = NULL;
trsv_t* trsv_cntl_ge_nrow_tcol = NULL;
trsv_t* trsv_cntl_ge_ncol_trow = NULL;
// Create the four global trsv control trees.
//
// The two "bs_ke" trees select the unblocked-fused implementation (variant 1
// and variant 2 respectively) and carry no sub-trees. The two "ge" trees
// select the blocked implementation and reference the shared scalv, packm,
// packv, gemv and unpackv trees plus one of the "bs_ke" trees as their trsv
// sub-tree, so the "bs_ke" trees must be created first.
//
// The parameter list is spelled ( void ) so that this definition is a
// prototype matching the declaration in the header.
void bli_trsv_cntl_init( void )
{
    // Create control trees for the lowest-level kernels. These trees induce
    // operations on (presumably) relatively small block-subvector problems.
    trsv_cntl_bs_ke_nrow_tcol
    =
    bli_trsv_cntl_obj_create( BLIS_UNB_FUSED,
                              BLIS_VARIANT1,
                              0,
                              NULL, NULL, NULL,
                              NULL, NULL, NULL,
                              NULL );
    trsv_cntl_bs_ke_ncol_trow
    =
    bli_trsv_cntl_obj_create( BLIS_UNB_FUSED,
                              BLIS_VARIANT2,
                              0,
                              NULL, NULL, NULL,
                              NULL, NULL, NULL,
                              NULL );

    // Create control trees for generally large problems. Here we choose a
    // variant that prioritizes keeping a subvector of x in cache.
    trsv_cntl_ge_nrow_tcol
    =
    bli_trsv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,        // use var1 to maximize x1 usage
                              BLIS_M2,
                              scalv_cntl,           // scale x up-front
                              packm_cntl,           // pack A11 (if needed)
                              packv_cntl,           // pack x1 (if needed)
                              gemv_cntl_rp_bs_dot,  // gemv_rp needed by var1
                              NULL,                 // gemv_cp not needed by var1
                              trsv_cntl_bs_ke_nrow_tcol,
                              unpackv_cntl );       // unpack x1 (if needed)
    trsv_cntl_ge_ncol_trow
    =
    bli_trsv_cntl_obj_create( BLIS_BLOCKED,
                              BLIS_VARIANT1,        // use var1 to maximize x1 usage
                              BLIS_M2,
                              scalv_cntl,           // scale x up-front
                              packm_cntl,           // pack A11 (if needed)
                              packv_cntl,           // pack x1 (if needed)
                              gemv_cntl_rp_bs_axpy, // gemv_rp needed by var1
                              NULL,                 // gemv_cp not needed by var1
                              trsv_cntl_bs_ke_ncol_trow,
                              unpackv_cntl );       // unpack x1 (if needed)
}
// Release the four trsv control-tree nodes created by bli_trsv_cntl_init().
// Only the trsv nodes themselves are passed to bli_cntl_free_node(); the
// global pointers are not reset afterwards.
//
// The parameter list is spelled ( void ) so that this definition is a
// prototype matching the declaration in the header.
void bli_trsv_cntl_finalize( void )
{
    bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol );
    bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow );
    bli_cntl_free_node( trsv_cntl_ge_nrow_tcol );
    bli_cntl_free_node( trsv_cntl_ge_ncol_trow );
}
// Allocate a trsv control-tree node with bli_malloc_intl() and populate
// every field from the arguments. Sub-tree pointers are stored as given
// (they are not copied). Returns the new node.
trsv_t* bli_trsv_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  scalv_t*   sub_scalv,
                                  packm_t*   sub_packm_a11,
                                  packv_t*   sub_packv_x1,
                                  gemv_t*    sub_gemv_rp,
                                  gemv_t*    sub_gemv_cp,
                                  trsv_t*    sub_trsv,
                                  unpackv_t* sub_unpackv_x1 )
{
    trsv_t* node = ( trsv_t* ) bli_malloc_intl( sizeof( *node ) );

    // Implementation selector, variant number and blocksize id.
    node->impl_type      = impl_type;
    node->var_num        = var_num;
    node->bszid          = bszid;

    // Sub-trees used by the selected variant.
    node->sub_scalv      = sub_scalv;
    node->sub_packm_a11  = sub_packm_a11;
    node->sub_packv_x1   = sub_packv_x1;
    node->sub_gemv_rp    = sub_gemv_rp;
    node->sub_gemv_cp    = sub_gemv_cp;
    node->sub_trsv       = sub_trsv;
    node->sub_unpackv_x1 = sub_unpackv_x1;

    return node;
}
// Populate every field of an existing, caller-provided trsv control-tree
// node from the arguments. No memory is allocated; sub-tree pointers are
// stored as given.
void bli_trsv_cntl_obj_init( trsv_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             scalv_t*   sub_scalv,
                             packm_t*   sub_packm_a11,
                             packv_t*   sub_packv_x1,
                             gemv_t*    sub_gemv_rp,
                             gemv_t*    sub_gemv_cp,
                             trsv_t*    sub_trsv,
                             unpackv_t* sub_unpackv_x1 )
{
    // Sub-trees used by the selected variant.
    cntl->sub_scalv      = sub_scalv;
    cntl->sub_packm_a11  = sub_packm_a11;
    cntl->sub_packv_x1   = sub_packv_x1;
    cntl->sub_gemv_rp    = sub_gemv_rp;
    cntl->sub_gemv_cp    = sub_gemv_cp;
    cntl->sub_trsv       = sub_trsv;
    cntl->sub_unpackv_x1 = sub_unpackv_x1;

    // Implementation selector, variant number and blocksize id.
    cntl->impl_type      = impl_type;
    cntl->var_num        = var_num;
    cntl->bszid          = bszid;
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_cntl.h 0000664 0000000 0000000 00000006362 14634250137 0024225 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Control-tree node for trsv. It records which implementation and variant
// to run, which blocksize id to use, and the sub-trees handed to the
// operations that a variant invokes.
struct trsv_s
{
// Implementation type and variant number.
impl_t impl_type;
varnum_t var_num;
// Blocksize identifier.
bszid_t bszid;
// Sub-tree for scaling x.
struct scalv_s* sub_scalv;
// Sub-trees for packing A11 and x1.
struct packm_s* sub_packm_a11;
struct packv_s* sub_packv_x1;
// Sub-trees for the gemv updates (row-panel and column-panel forms,
// judging by the _rp/_cp suffixes -- confirm against the gemv trees).
struct gemv_s* sub_gemv_rp;
struct gemv_s* sub_gemv_cp;
// Sub-tree for the trsv sub-problem on the diagonal block.
struct trsv_s* sub_trsv;
// Sub-tree for unpacking x1.
struct unpackv_s* sub_unpackv_x1;
};
typedef struct trsv_s trsv_t;
// Access the trsv sub-tree of a control-tree node. The argument and the
// whole expansion are parenthesized so the macro works with any pointer
// expression (e.g. p + 1) and inside larger expressions; the result is
// still an lvalue.
#define bli_cntl_sub_trsv( cntl ) ( (cntl)->sub_trsv )
// Creation and release of the global trsv control trees.
void bli_trsv_cntl_init( void );
void bli_trsv_cntl_finalize( void );

// Allocate a trsv control-tree node and set every field from the arguments.
trsv_t* bli_trsv_cntl_obj_create( impl_t     impl_type,
                                  varnum_t   var_num,
                                  bszid_t    bszid,
                                  scalv_t*   sub_scalv,
                                  packm_t*   sub_packm_a11,
                                  packv_t*   sub_packv_x1,
                                  gemv_t*    sub_gemv_rp,
                                  gemv_t*    sub_gemv_cp,
                                  trsv_t*    sub_trsv,
                                  unpackv_t* sub_unpackv_x1 );

// Set every field of an existing trsv control-tree node from the arguments.
void bli_trsv_cntl_obj_init( trsv_t*    cntl,
                             impl_t     impl_type,
                             varnum_t   var_num,
                             bszid_t    bszid,
                             scalv_t*   sub_scalv,
                             packm_t*   sub_packm_a11,
                             packv_t*   sub_packv_x1,
                             gemv_t*    sub_gemv_rp,
                             gemv_t*    sub_gemv_cp,
                             trsv_t*    sub_trsv,
                             unpackv_t* sub_unpackv_x1 );
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_front.c 0000664 0000000 0000000 00000013076 14634250137 0024410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// trsv control trees defined in another translation unit.
// bli_trsv_front() picks one of the bs_ke_* trees when both operands have
// unit stride and one of the ge_* trees otherwise.
extern trsv_t* trsv_cntl_bs_ke_nrow_tcol;
extern trsv_t* trsv_cntl_bs_ke_ncol_trow;
extern trsv_t* trsv_cntl_ge_nrow_tcol;
extern trsv_t* trsv_cntl_ge_ncol_trow;
// Object-based front-end for trsv.
//
// Optionally checks the operands, makes a local copy-cast of alpha, selects
// a control tree based on the strides, storage and transposition of the
// operands, and hands off to bli_trsv_int().
//
// Side effect: when not every operand has unit stride, any operand that
// does have unit stride gets its pack schema set (on the caller's object)
// so that it is treated as already packed.
void bli_trsv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx
     )
{
    trsv_t* tree;
    obj_t   alpha_cast;

    // Validate the operands when error checking is enabled.
    if ( bli_error_checking_is_enabled() )
        bli_trsv_check( alpha, a, x );

    // Target datatypes of A and x.
    const num_t dt_a = bli_obj_dt( a );
    const num_t dt_x = bli_obj_dt( x );

    // Determine whether each operand has unit stride: A when it is row- or
    // column-stored, x when its vector increment is one.
    const bool a_unit = ( bli_obj_is_row_stored( a ) ||
                          bli_obj_is_col_stored( a ) );
    const bool x_unit = ( bli_obj_vector_inc( x ) == 1 );

    // Copy-cast alpha into a detached local object. The datatype is the
    // union of the datatypes of A and x so that no information is lost
    // unnecessarily during the computation.
    const num_t dt_alpha = bli_dt_union( dt_a, dt_x );
    bli_obj_scalar_init_detached_copy_of( dt_alpha,
                                          BLIS_NO_CONJUGATE,
                                          alpha,
                                          &alpha_cast );

    if ( a_unit && x_unit )
    {
        // Every operand has unit stride: use a tree that calls the
        // unblocked implementation directly. The nrow_tcol tree is used
        // for (no-transpose, row-stored) and (transpose, not row-stored);
        // the ncol_trow tree for the other two combinations.
        const bool notrans = bli_obj_has_notrans( a );
        const bool by_rows = bli_obj_is_row_stored( a );

        tree = ( notrans ? by_rows : !by_rows )
               ? trsv_cntl_bs_ke_nrow_tcol
               : trsv_cntl_bs_ke_ncol_trow;
    }
    else
    {
        // Mark unit-stride operands as already packed so the blocked
        // algorithm does not pack them again.
        if ( a_unit ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a );
        if ( x_unit ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x );

        // Same selection as above, except that it looks at storage tilt
        // and picks a tree that performs blocking.
        const bool notrans  = bli_obj_has_notrans( a );
        const bool row_tilt = bli_obj_is_row_tilted( a );

        tree = ( notrans ? row_tilt : !row_tilt )
               ? trsv_cntl_ge_nrow_tcol
               : trsv_cntl_ge_ncol_trow;
    }

    // Invoke the internal back-end with the copy-cast of alpha and the
    // chosen control tree.
    bli_trsv_int( &alpha_cast, a, x, cntx, tree );
}
//
// Define BLAS-like interfaces with homogeneous-typed operands.
//
// GENTFUNC( ctype, ch, opname ) defines the typed (BLAS-like) form of
// opname. It wraps the raw buffers in objects -- alpha as a 1x1 object, a
// as an m x m triangular object carrying uploa/transa/diaga, x as an m x 1
// object -- and forwards them to the object-based function named by
// PASTEMAC0(opname).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Strides used to describe x as an m x 1 object. */ \
    inc_t rs_x = incx; \
    inc_t cs_x = m * incx; \
\
    obj_t alphao, ao, xo; \
\
    /* Attach the caller's buffers to local objects. */ \
    bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \
    bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \
    bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \
\
    /* Record the properties of A. */ \
    bli_obj_set_uplo( uploa, &ao ); \
    bli_obj_set_conjtrans( transa, &ao ); \
    bli_obj_set_diag( diaga, &ao ); \
    bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \
\
    /* Hand off to the object-based implementation. */ \
    PASTEMAC0(opname)( &alphao, &ao, &xo, cntx ); \
}

INSERT_GENTFUNC_BASIC0( trsv_front )
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_front.h 0000664 0000000 0000000 00000004147 14634250137 0024414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based front-end for trsv.
void bli_trsv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx
     );

// Typed (BLAS-like) front-ends. GENTPROT takes exactly three arguments
// (ctype, ch, opname), so the prototypes are generated with the
// one-argument INSERT_GENTPROT_BASIC0 form. This matches
// INSERT_GENTFUNC_BASIC0( trsv_front ), which generates the definitions.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC0( trsv_front )
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_int.c 0000664 0000000 0000000 00000012020 14634250137 0024036 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by every entry of the dispatch table below.
#define FUNCPTR_T trsv_fp

typedef void (*FUNCPTR_T)( obj_t*  alpha,
                           obj_t*  a,
                           obj_t*  x,
                           cntx_t* cntx,
                           trsv_t* cntl );

// Dispatch table indexed as vars[uplo][variant][implementation], where
// uplo is 0 for lower and 1 for upper. Within each row the columns are
// unblocked, unblocked with fusing, and blocked. The third variant row
// holds no functions.
static FUNCPTR_T vars[2][3][3] =
{
    // lower triangular
    {
        // unblocked         unblocked with fusing  blocked
        { bli_trsv_unb_var1, bli_trsv_unf_var1,     bli_trsv_l_blk_var1 },
        { bli_trsv_unb_var2, bli_trsv_unf_var2,     bli_trsv_l_blk_var2 },
        { NULL,              NULL,                  NULL                },
    },
    // upper triangular
    {
        // unblocked         unblocked with fusing  blocked
        { bli_trsv_unb_var1, bli_trsv_unf_var1,     bli_trsv_u_blk_var1 },
        { bli_trsv_unb_var2, bli_trsv_unf_var2,     bli_trsv_u_blk_var2 },
        { NULL,              NULL,                  NULL                },
    }
};
// Internal trsv back-end: selects the variant named by the control tree
// (and by whether A is effectively lower or upper triangular) from the
// dispatch table and invokes it. Returns without doing anything when A or
// x has a zero dimension.
void bli_trsv_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   cntx_t* cntx,
                   trsv_t* cntl )
{
    obj_t a_alias;

    // Validate the operands when error checking is enabled.
    if ( bli_error_checking_is_enabled() )
        bli_trsv_check( alpha, a, x );

    // Nothing to do if A or x has a zero dimension.
    if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( x ) ) return;

    // Work on an alias of A in case a transformation (ie: transposition)
    // needs to be induced.
    bli_obj_alias_to( a, &a_alias );

    // NOTE: to support cases where B is complex and A is real, we will
    // need to have the default side case be BLIS_RIGHT and then express
    // the left case in terms of it, rather than the other way around.

    // First table index: 0 when A is lower triangular, 1 otherwise.
    bool upper = ( bli_obj_is_lower( &a_alias ) ? 0 : 1 );

    // The transposed cases are not implemented explicitly. Instead, when A
    // is marked as needing a transposition, the uplo index is toggled so
    // that the correct algorithm is induced. When that algorithm partitions
    // A it grabs the correct subpartitions, which inherit A's transposition
    // bit, so downstream subproblems do the right thing. The same end could
    // be reached by inducing a transposition with bli_obj_induce_trans(),
    // which swaps dimensions, strides, and offsets; for example, a lower
    // triangular, column-major matrix that needs a transpose would be
    // recast as an upper triangular, row-major matrix with no transpose
    // needed. How transposition is handled here does NOT affect the optimal
    // choice of kernel (ie: a column-major column panel matrix with
    // transpose times a vector would use the same kernel as a row-major row
    // panel matrix with no transpose times a vector).
    if ( bli_obj_has_trans( &a_alias ) )
    {
        //bli_obj_induce_trans( &a_alias );
        //bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_alias );
        upper = !upper;
    }

    // Remaining table indices: variant number and implementation type.
    const varnum_t var  = bli_cntl_var_num( cntl );
    const impl_t   impl = bli_cntl_impl_type( cntl );

    // Look up the variant and invoke it.
    FUNCPTR_T fp = vars[upper][var][impl];

    fp( alpha, &a_alias, x, cntx, cntl );
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_int.h 0000664 0000000 0000000 00000003454 14634250137 0024056 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal trsv back-end; cntl selects the variant and implementation.
void bli_trsv_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       cntx_t* cntx,
       trsv_t* cntl
     );
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_l_blk_var1.c 0000664 0000000 0000000 00000010255 14634250137 0025270 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trsv, lower triangular, variant 1.
//
// Scales x by alpha, then walks the diagonal of A from top-left to
// bottom-right. For each diagonal block it updates x1 with the block row
// to its left (x1 = x1 - A10 * x0) and then solves against A11.
void bli_trsv_l_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trsv_t* cntl )
{
    obj_t a11, a11_packed;
    obj_t a10;
    obj_t x1, x1_packed;
    obj_t x0;

    // Prepare the pack objects.
    bli_obj_init_pack( &a11_packed );
    bli_obj_init_pack( &x1_packed );

    // Problem size.
    const dim_t m = bli_obj_length( a );

    // x = alpha * x;
    bli_scalv_int( alpha,
                   x,
                   cntx, bli_cntl_sub_scalv( cntl ) );

    // Step along the diagonal, one block per iteration.
    dim_t i = 0;
    while ( i < m )
    {
        // Algorithmic blocksize for this iteration.
        const dim_t b = bli_determine_blocksize_f( i, m, a,
                                                   bli_cntl_bszid( cntl ), cntx );

        // Partitions of A (A11, A10) and x (x1, x0).
        bli_acquire_mpart_tl2br( BLIS_SUBPART11, i, b, a, &a11 );
        bli_acquire_mpart_tl2br( BLIS_SUBPART10, i, b, a, &a10 );
        bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b, x, &x1 );
        bli_acquire_vpart_f2b( BLIS_SUBPART0, i, b, x, &x0 );

        // Set up the pack objects for A11 and x1 (if needed).
        bli_packm_init( &a11, &a11_packed,
                        cntx, bli_cntl_sub_packm_a11( cntl ) );
        bli_packv_init( &x1, &x1_packed,
                        cntx, bli_cntl_sub_packv_x1( cntl ) );

        // Copy/pack A11 and x1 (if needed).
        bli_packm_int( &a11, &a11_packed,
                       cntx, bli_cntl_sub_packm_a11( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &x1, &x1_packed,
                       cntx, bli_cntl_sub_packv_x1( cntl ) );

        // x1 = x1 - A10 * x0;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      &BLIS_MINUS_ONE,
                      &a10,
                      &x0,
                      &BLIS_ONE,
                      &x1_packed,
                      cntx,
                      bli_cntl_sub_gemv_rp( cntl ) );

        // x1 = x1 / tril( A11 );
        bli_trsv_int( &BLIS_ONE,
                      &a11_packed,
                      &x1_packed,
                      cntx,
                      bli_cntl_sub_trsv( cntl ) );

        // Copy/unpack x1 (if it was packed).
        bli_unpackv_int( &x1_packed, &x1,
                         cntx, bli_cntl_sub_unpackv_x1( cntl ) );

        i += b;
    }

    // Hand any packing buffers that were acquired back to the memory
    // manager.
    bli_packm_release( &a11_packed, bli_cntl_sub_packm_a11( cntl ) );
    bli_packv_release( &x1_packed, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_l_blk_var2.c 0000664 0000000 0000000 00000010255 14634250137 0025271 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trsv, lower triangular, variant 2.
//
// Scales x by alpha, then walks the diagonal of A from top-left to
// bottom-right. For each diagonal block it solves against A11 and then
// updates the remainder of x with the block column below
// (x2 = x2 - A21 * x1).
void bli_trsv_l_blk_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trsv_t* cntl )
{
    obj_t a11, a11_packed;
    obj_t a21;
    obj_t x1, x1_packed;
    obj_t x2;

    // Prepare the pack objects.
    bli_obj_init_pack( &a11_packed );
    bli_obj_init_pack( &x1_packed );

    // Problem size.
    const dim_t m = bli_obj_length( a );

    // x = alpha * x;
    bli_scalv_int( alpha,
                   x,
                   cntx, bli_cntl_sub_scalv( cntl ) );

    // Step along the diagonal, one block per iteration.
    dim_t i = 0;
    while ( i < m )
    {
        // Algorithmic blocksize for this iteration.
        const dim_t b = bli_determine_blocksize_f( i, m, a,
                                                   bli_cntl_bszid( cntl ), cntx );

        // Partitions of A (A11, A21) and x (x1, x2).
        bli_acquire_mpart_tl2br( BLIS_SUBPART11, i, b, a, &a11 );
        bli_acquire_mpart_tl2br( BLIS_SUBPART21, i, b, a, &a21 );
        bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b, x, &x1 );
        bli_acquire_vpart_f2b( BLIS_SUBPART2, i, b, x, &x2 );

        // Set up the pack objects for A11 and x1 (if needed).
        bli_packm_init( &a11, &a11_packed,
                        cntx, bli_cntl_sub_packm_a11( cntl ) );
        bli_packv_init( &x1, &x1_packed,
                        cntx, bli_cntl_sub_packv_x1( cntl ) );

        // Copy/pack A11 and x1 (if needed).
        bli_packm_int( &a11, &a11_packed,
                       cntx, bli_cntl_sub_packm_a11( cntl ),
                       &BLIS_PACKM_SINGLE_THREADED );
        bli_packv_int( &x1, &x1_packed,
                       cntx, bli_cntl_sub_packv_x1( cntl ) );

        // x1 = x1 / tril( A11 );
        bli_trsv_int( &BLIS_ONE,
                      &a11_packed,
                      &x1_packed,
                      cntx,
                      bli_cntl_sub_trsv( cntl ) );

        // x2 = x2 - A21 * x1;
        bli_gemv_int( BLIS_NO_TRANSPOSE,
                      BLIS_NO_CONJUGATE,
                      &BLIS_MINUS_ONE,
                      &a21,
                      &x1_packed,
                      &BLIS_ONE,
                      &x2,
                      cntx,
                      bli_cntl_sub_gemv_cp( cntl ) );

        // Copy/unpack x1 (if it was packed).
        bli_unpackv_int( &x1_packed, &x1,
                         cntx, bli_cntl_sub_unpackv_x1( cntl ) );

        i += b;
    }

    // Hand any packing buffers that were acquired back to the memory
    // manager.
    bli_packm_release( &a11_packed, bli_cntl_sub_packm_a11( cntl ) );
    bli_packv_release( &x1_packed, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_u_blk_var1.c 0000664 0000000 0000000 00000010255 14634250137 0025301 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked variant 1 of the upper-triangular solve (trsv): x is first scaled
// by alpha, then A is partitioned along its diagonal using the "backward"
// (br2tl / b2f) partitioning helpers. For each diagonal block the current
// subvector x1 is updated with x1 -= A12 * x2 and then solved against A11.
//
// alpha - scalar applied to x before the solve.
// a     - the triangular matrix; its length gives the problem dimension.
// x     - the vector that is overwritten with the result.
// cntx  - context passed through to every sub-operation.
// cntl  - trsv control tree node; supplies the blocksize id and the
//         sub-nodes for scalv, packm, packv, gemv, trsv, and unpackv.
void bli_trsv_u_blk_var1( obj_t* alpha,
obj_t* a,
obj_t* x,
cntx_t* cntx,
trsv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a12;
obj_t x1, x1_pack;
obj_t x2;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
// Query dimension.
mn = bli_obj_length( a );
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally. Note that b_alg is assigned at the top of each
// iteration, before the loop increment reads it.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A12, x1, and x2.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_br2tl( BLIS_SUBPART12,
ij, b_alg, a, &a12 );
bli_acquire_vpart_b2f( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_b2f( BLIS_SUBPART2,
ij, b_alg, x, &x2 );
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 - A12 * x2;
// The update is applied to the packed copy of x1; A12 and x2 are used
// unpacked.
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
&BLIS_MINUS_ONE,
&a12,
&x2,
&BLIS_ONE,
&x1_pack,
cntx,
bli_cntl_sub_gemv_rp( cntl ) );
// x1 = x1 / triu( A11 );
// (A11 is the diagonal block of the upper-triangular matrix handled by
// this "_u_" routine.)
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
bli_cntl_sub_trsv( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired while packing A11 or x1, release
// them back to the memory manager. This is done once, after the loop.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/2/trsv/other/bli_trsv_u_blk_var2.c 0000664 0000000 0000000 00000010255 14634250137 0025302 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked variant 2 of the upper-triangular solve (trsv): x is first scaled
// by alpha, then A is partitioned along its diagonal using the "backward"
// (br2tl / b2f) partitioning helpers. For each diagonal block the current
// subvector x1 is solved against A11, after which the result is used to
// update x0 with x0 -= A01 * x1.
//
// alpha - scalar applied to x before the solve.
// a     - the triangular matrix; its length gives the problem dimension.
// x     - the vector that is overwritten with the result.
// cntx  - context passed through to every sub-operation.
// cntl  - trsv control tree node; supplies the blocksize id and the
//         sub-nodes for scalv, packm, packv, trsv, gemv, and unpackv.
void bli_trsv_u_blk_var2( obj_t* alpha,
obj_t* a,
obj_t* x,
cntx_t* cntx,
trsv_t* cntl )
{
obj_t a11, a11_pack;
obj_t a01;
obj_t x1, x1_pack;
obj_t x0;
dim_t mn;
dim_t ij;
dim_t b_alg;
// Initialize objects for packing.
bli_obj_init_pack( &a11_pack );
bli_obj_init_pack( &x1_pack );
// Query dimension.
mn = bli_obj_length( a );
// x = alpha * x;
bli_scalv_int( alpha,
x,
cntx, bli_cntl_sub_scalv( cntl ) );
// Partition diagonally. Note that b_alg is assigned at the top of each
// iteration, before the loop increment reads it.
for ( ij = 0; ij < mn; ij += b_alg )
{
// Determine the current algorithmic blocksize.
b_alg = bli_determine_blocksize_b( ij, mn, a,
bli_cntl_bszid( cntl ), cntx );
// Acquire partitions for A11, A01, x1, and x0.
bli_acquire_mpart_br2tl( BLIS_SUBPART11,
ij, b_alg, a, &a11 );
bli_acquire_mpart_br2tl( BLIS_SUBPART01,
ij, b_alg, a, &a01 );
bli_acquire_vpart_b2f( BLIS_SUBPART1,
ij, b_alg, x, &x1 );
bli_acquire_vpart_b2f( BLIS_SUBPART0,
ij, b_alg, x, &x0 );
// Initialize objects for packing A11 and x1 (if needed).
bli_packm_init( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_init( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
// Copy/pack A11, x1 (if needed).
bli_packm_int( &a11, &a11_pack,
cntx, bli_cntl_sub_packm_a11( cntl ),
&BLIS_PACKM_SINGLE_THREADED );
bli_packv_int( &x1, &x1_pack,
cntx, bli_cntl_sub_packv_x1( cntl ) );
// x1 = x1 / triu( A11 );
// (A11 is the diagonal block of the upper-triangular matrix handled by
// this "_u_" routine.)
bli_trsv_int( &BLIS_ONE,
&a11_pack,
&x1_pack,
cntx,
bli_cntl_sub_trsv( cntl ) );
// x0 = x0 - A01 * x1;
// The freshly solved (packed) x1 is the input; x0 is updated in place.
bli_gemv_int( BLIS_NO_TRANSPOSE,
BLIS_NO_CONJUGATE,
&BLIS_MINUS_ONE,
&a01,
&x1_pack,
&BLIS_ONE,
&x0,
cntx,
bli_cntl_sub_gemv_cp( cntl ) );
// Copy/unpack x1 (if x1 was packed).
bli_unpackv_int( &x1_pack, &x1,
cntx, bli_cntl_sub_unpackv_x1( cntl ) );
}
// If any packing buffers were acquired while packing A11 or x1, release
// them back to the memory manager. This is done once, after the loop.
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-1.0.0/blis/_src/frame/3/ 0000775 0000000 0000000 00000000000 14634250137 0017063 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/bli_l3.h 0000664 0000000 0000000 00000006220 14634250137 0020400 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_l3_cntl.h"
#include "bli_l3_check.h"
#include "bli_l3_int.h"
#include "bli_l3_packab.h"
// Define function types.
//#include "bli_l3_ft_ex.h"
#include "bli_l3_ft_ukr.h"
#include "bli_l3_oft.h"
#include "bli_l3_oft_var.h"
#include "bli_l3_blocksize.h"
#include "bli_l3_direct.h"
#include "bli_l3_prune.h"
#include "bli_l3_schema.h"
// Prototype object APIs (basic and expert).
#include "bli_l3_oapi.h"
#include "bli_l3_oapi_ex.h"
// Prototype typed APIs (basic and expert).
#include "bli_l3_tapi.h"
#include "bli_l3_tapi_ex.h"
// Define function types for small/unpacked handlers/kernels.
#include "bli_l3_sup_oft.h"
#include "bli_l3_sup_ft_ker.h"
// Define static edge case logic for use in small/unpacked kernels.
//#include "bli_l3_sup_edge.h"
// Prototype object API to small/unpacked matrix dispatcher.
#include "bli_l3_sup.h"
// Prototype reference implementation of small/unpacked matrix handler.
#include "bli_l3_sup_ref.h"
#include "bli_l3_sup_int.h"
#include "bli_l3_sup_vars.h"
#include "bli_l3_sup_packm_a.h"
#include "bli_l3_sup_packm_b.h"
#include "bli_l3_sup_packm_var.h"
// Prototype microkernel wrapper APIs.
#include "bli_l3_ukr_oapi.h"
#include "bli_l3_ukr_tapi.h"
// Generate function pointer arrays for tapi microkernel functions.
#include "bli_l3_ukr_fpa.h"
// Operation-specific headers.
#include "bli_gemm.h"
#include "bli_hemm.h"
#include "bli_symm.h"
#include "bli_trmm.h"
#include "bli_trmm3.h"
#include "bli_trsm.h"
#include "bli_gemmt.h"
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_blocksize.c 0000664 0000000 0000000 00000024615 14634250137 0022450 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Pick the kc-blocksize routine that matches the operation family stored in
// the control tree node and forward every other argument to it unchanged.
// Families without a dedicated routine fall back to the gemm version.
dim_t bli_l3_determine_kc
     (
       dir_t    direct,
       dim_t    i,
       dim_t    dim,
       obj_t*   a,
       obj_t*   b,
       bszid_t  bszid,
       cntx_t*  cntx,
       cntl_t*  cntl
     )
{
	switch ( bli_cntl_family( cntl ) )
	{
		case BLIS_GEMMT:
			return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx );

		case BLIS_TRMM:
			return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx );

		case BLIS_TRSM:
			return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx );

		case BLIS_GEMM:
		default:
			// The default label is not expected to be reached in practice;
			// it mirrors the gemm behavior as a safe fallback.
			return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
	}
}
// -----------------------------------------------------------------------------
//
// NOTE: We call a gemm/hemm/symm-, gemmt-, trmm-, or trsm-specific blocksize
// function to determine the kc blocksize so that we can implement the
// "nudging" of kc to be a multiple of mr or nr, as needed.
//
// Generates bli_<l3op>_determine_kc(), a thin dispatcher that forwards to the
// "_f" (forward) or "_b" (backward) variant depending on the direction.
#undef  GENFRONT
#define GENFRONT( opname, l3op ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t    direct, \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	/* BLIS_FWD selects the forward routine; anything else the backward one. */ \
	return ( direct == BLIS_FWD \
	         ? PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ) \
	         : PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ) ); \
}

GENFRONT( gemm_determine_kc, gemm )
GENFRONT( gemmt_determine_kc, gemmt )
GENFRONT( trmm_determine_kc, trmm )
GENFRONT( trsm_determine_kc, trsm )
// -----------------------------------------------------------------------------
// Generates bli_gemm_determine_kc_f() and bli_gemm_determine_kc_b().
// The "_f" flavor is meant for algorithms moving forward (top to bottom,
// left to right, top-left to bottom-right); the "_b" flavor is meant for
// algorithms moving backward (bottom to top, right to left, bottom-right to
// top-left). The two differ only in the blocksize helper they call at the end.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	/* Look up the default and maximum blocksizes for the execution */ \
	/* datatype of A. */ \
	const num_t exec_dt  = bli_obj_exec_dt( a ); \
	blksz_t*    kc_blksz = bli_cntx_get_blksz( bszid, cntx ); \
	dim_t       kc_def   = bli_blksz_get_def( exec_dt, kc_blksz ); \
	dim_t       kc_max   = bli_blksz_get_max( exec_dt, kc_blksz ); \
\
	/* Round both values up to a multiple of MR when A is Hermitian or */ \
	/* symmetric, or of NR when B is. Otherwise leave them untouched. */ \
	if ( bli_obj_root_is_herm_or_symm( a ) ) \
	{ \
		const dim_t mr = bli_cntx_get_blksz_def_dt( exec_dt, BLIS_MR, cntx ); \
\
		kc_def = bli_align_dim_to_mult( kc_def, mr ); \
		kc_max = bli_align_dim_to_mult( kc_max, mr ); \
	} \
	else if ( bli_obj_root_is_herm_or_symm( b ) ) \
	{ \
		const dim_t nr = bli_cntx_get_blksz_def_dt( exec_dt, BLIS_NR, cntx ); \
\
		kc_def = bli_align_dim_to_mult( kc_def, nr ); \
		kc_max = bli_align_dim_to_mult( kc_max, nr ); \
	} \
\
	/* Defer to the bli_determine_blocksize_[fb]_sub() helper (bli_blksz.c). */ \
	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, kc_def, kc_max ); \
}

GENFRONT( gemm_determine_kc_f, f )
GENFRONT( gemm_determine_kc_b, b )
// -----------------------------------------------------------------------------
// Generates bli_gemmt_determine_kc_f() and bli_gemmt_determine_kc_b().
// The "_f" flavor is meant for algorithms moving forward (top to bottom,
// left to right, top-left to bottom-right); the "_b" flavor is meant for
// algorithms moving backward (bottom to top, right to left, bottom-right to
// top-left). The two differ only in the blocksize helper they call at the end.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	/* Look up the default and maximum blocksizes for the execution */ \
	/* datatype of A. */ \
	const num_t exec_dt  = bli_obj_exec_dt( a ); \
	blksz_t*    kc_blksz = bli_cntx_get_blksz( bszid, cntx ); \
	const dim_t kc_def   = bli_blksz_get_def( exec_dt, kc_blksz ); \
	const dim_t kc_max   = bli_blksz_get_max( exec_dt, kc_blksz ); \
\
	/* gemmt needs no MR/NR adjustment of the kc blocksizes, so they are */ \
	/* handed to the helper as queried. */ \
\
	/* Defer to the bli_determine_blocksize_[fb]_sub() helper (bli_blksz.c). */ \
	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, kc_def, kc_max ); \
}

GENFRONT( gemmt_determine_kc_f, f )
GENFRONT( gemmt_determine_kc_b, b )
// -----------------------------------------------------------------------------
// Generates bli_trmm_determine_kc_f() and bli_trmm_determine_kc_b().
// The "_f" flavor is meant for algorithms moving forward (top to bottom,
// left to right, top-left to bottom-right); the "_b" flavor is meant for
// algorithms moving backward (bottom to top, right to left, bottom-right to
// top-left). The two differ only in the blocksize helper they call at the end.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	/* Look up the default and maximum blocksizes for the execution */ \
	/* datatype of A. */ \
	const num_t exec_dt  = bli_obj_exec_dt( a ); \
	blksz_t*    kc_blksz = bli_cntx_get_blksz( bszid, cntx ); \
	dim_t       kc_def   = bli_blksz_get_def( exec_dt, kc_blksz ); \
	dim_t       kc_max   = bli_blksz_get_max( exec_dt, kc_blksz ); \
\
	/* The rounding multiple is MR when the triangular matrix is on the */ \
	/* left (i.e. A is triangular), and NR when it is on the right. */ \
	const dim_t mult = ( bli_obj_root_is_triangular( a ) \
	                     ? bli_cntx_get_blksz_def_dt( exec_dt, BLIS_MR, cntx ) \
	                     : bli_cntx_get_blksz_def_dt( exec_dt, BLIS_NR, cntx ) ); \
\
	/* Round the default and maximum kc up to the nearest such multiple. */ \
	kc_def = bli_align_dim_to_mult( kc_def, mult ); \
	kc_max = bli_align_dim_to_mult( kc_max, mult ); \
\
	/* Defer to the bli_determine_blocksize_[fb]_sub() helper (bli_blksz.c). */ \
	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, kc_def, kc_max ); \
}

GENFRONT( trmm_determine_kc_f, f )
GENFRONT( trmm_determine_kc_b, b )
// -----------------------------------------------------------------------------
// Generates bli_trsm_determine_kc_f() and bli_trsm_determine_kc_b().
// The "_f" flavor is meant for algorithms moving forward (top to bottom,
// left to right, top-left to bottom-right); the "_b" flavor is meant for
// algorithms moving backward (bottom to top, right to left, bottom-right to
// top-left). The two differ only in the blocksize helper they call at the end.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	/* Look up the default and maximum blocksizes for the execution */ \
	/* datatype of A. */ \
	const num_t exec_dt  = bli_obj_exec_dt( a ); \
	blksz_t*    kc_blksz = bli_cntx_get_blksz( bszid, cntx ); \
	dim_t       kc_def   = bli_blksz_get_def( exec_dt, kc_blksz ); \
	dim_t       kc_max   = bli_blksz_get_max( exec_dt, kc_blksz ); \
\
	/* MR is always the rounding multiple here (never NR): even when the */ \
	/* triangle is on the right, that matrix is packed using MR, since */ \
	/* only left-side trsm micro-kernels are supported. */ \
	const dim_t mr = bli_cntx_get_blksz_def_dt( exec_dt, BLIS_MR, cntx ); \
\
	kc_def = bli_align_dim_to_mult( kc_def, mr ); \
	kc_max = bli_align_dim_to_mult( kc_max, mr ); \
\
	/* Defer to the bli_determine_blocksize_[fb]_sub() helper (bli_blksz.c). */ \
	return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, kc_def, kc_max ); \
}

GENFRONT( trsm_determine_kc_f, f )
GENFRONT( trsm_determine_kc_b, b )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_blocksize.h 0000664 0000000 0000000 00000005300 14634250137 0022443 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Returns the kc blocksize to use at offset i of a dimension of size dim.
// The operation family is read from cntl and selects which of the
// operation-specific bli_*_determine_kc() functions declared below is used.
dim_t bli_l3_determine_kc
(
dir_t direct,
dim_t i,
dim_t dim,
obj_t* a,
obj_t* b,
bszid_t bszid,
cntx_t* cntx,
cntl_t* cntl
);
// Prototypes for the operation-specific kc functions that take a direction
// argument (one each for gemm, gemmt, trmm, and trsm).
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
( \
dir_t direct, \
dim_t i, \
dim_t dim, \
obj_t* a, \
obj_t* b, \
bszid_t bszid, \
cntx_t* cntx \
);
GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )
// Prototypes for the direction-specific kc functions: a "_f" and a "_b"
// variant per operation. These take no direction argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
( \
dim_t i, \
dim_t dim, \
obj_t* a, \
obj_t* b, \
bszid_t bszid, \
cntx_t* cntx \
);
GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )
GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )
GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )
GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_check.c 0000664 0000000 0000000 00000032043 14634250137 0021532 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validates the operands of a gemm operation. At present this only delegates
// to bli_gemm_basic_check(); the structure checks on A and B below are
// intentionally disabled (see the NOTE).
void bli_gemm_check
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
)
{
// e_val is only needed by the disabled structure checks below.
//err_t e_val;
// Check basic properties of the operation.
bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
// Check object structure.
// NOTE: Can't perform these checks as long as bli_gemm_check() is called
// from bli_l3_int(), which is in the execution path for structured
// level-3 operations such as hemm.
//e_val = bli_check_general_object( a );
//bli_check_error_code( e_val );
//e_val = bli_check_general_object( b );
//bli_check_error_code( e_val );
}
// Validates the operands of a gemmt operation: the shared gemmt checks first,
// then the requirement that C be square.
void bli_gemmt_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Datatype, dimension, and buffer checks shared by gemmt callers.
	bli_gemmt_basic_check( alpha, a, b, beta, c, cntx );

	// C must be a square matrix.
	status = bli_check_square_object( c );
	bli_check_error_code( status );
}
// Validates the operands of a hemm operation: the checks shared by
// hemm/symm/trmm/trsm first, then the requirement that A be Hermitian.
void bli_hemm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Checks common to hemm/symm/trmm/trsm.
	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// A must carry Hermitian structure.
	status = bli_check_hermitian_object( a );
	bli_check_error_code( status );
}
// Validates the operands of a herk operation. A is aliased as A^H so that the
// generic dimension checks can be reused; alpha and beta must be real-valued
// and C must be Hermitian.
void bli_herk_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;
	obj_t a_conjtrans;

	// Build an alias of A with a conjugate-transpose applied, for the
	// dimension checks performed by the basic check.
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &a_conjtrans );

	// Checks shared with syrk.
	bli_herk_basic_check( alpha, a, &a_conjtrans, beta, c, cntx );

	// Both scalars must be real-valued.
	status = bli_check_real_valued_object( alpha );
	bli_check_error_code( status );

	status = bli_check_real_valued_object( beta );
	bli_check_error_code( status );

	// C must carry Hermitian structure.
	status = bli_check_hermitian_object( c );
	bli_check_error_code( status );
}
// Validates the operands of a her2k operation. A and B are aliased as A^H and
// B^H so that the generic dimension checks can be reused; beta must be
// real-valued and C must be Hermitian.
void bli_her2k_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;
	obj_t a_conjtrans;
	obj_t b_conjtrans;

	// Build conjugate-transposed aliases of A and B for the dimension checks.
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &a_conjtrans );
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &b_conjtrans );

	// Checks shared with syr2k; operands are passed as the pairs
	// (A, B^H) and (B, A^H).
	bli_her2k_basic_check( alpha, a, &b_conjtrans, b, &a_conjtrans, beta, c, cntx );

	// Only beta is required to be real-valued here.
	status = bli_check_real_valued_object( beta );
	bli_check_error_code( status );

	// C must carry Hermitian structure.
	status = bli_check_hermitian_object( c );
	bli_check_error_code( status );
}
// Validates the operands of a symm operation: the checks shared with hemm
// first, then the requirement that A be symmetric.
void bli_symm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// symm reuses the hemm basic checks.
	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// A must carry symmetric structure.
	status = bli_check_symmetric_object( a );
	bli_check_error_code( status );
}
// Validates the operands of a syrk operation. A is aliased as A^T so that the
// generic dimension checks can be reused; C must be symmetric.
void bli_syrk_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;
	obj_t a_trans;

	// Build a transposed alias of A for the dimension checks.
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &a_trans );

	// syrk reuses the herk basic checks.
	bli_herk_basic_check( alpha, a, &a_trans, beta, c, cntx );

	// C must carry symmetric structure.
	status = bli_check_symmetric_object( c );
	bli_check_error_code( status );
}
// Validates the operands of a syr2k operation. A and B are aliased as A^T and
// B^T so that the generic dimension checks can be reused; C must be symmetric.
void bli_syr2k_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;
	obj_t a_trans;
	obj_t b_trans;

	// Build transposed aliases of A and B for the dimension checks.
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &a_trans );
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &b_trans );

	// syr2k reuses the her2k basic checks; operands are passed as the pairs
	// (A, B^T) and (B, A^T).
	bli_her2k_basic_check( alpha, a, &b_trans, b, &a_trans, beta, c, cntx );

	// C must carry symmetric structure.
	status = bli_check_symmetric_object( c );
	bli_check_error_code( status );
}
// Validates the operands of a trmm3 operation: the checks shared by
// hemm/symm/trmm/trsm first, then the requirement that A be triangular.
void bli_trmm3_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Checks common to hemm/symm/trmm/trsm.
	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// A must carry triangular structure.
	status = bli_check_triangular_object( a );
	bli_check_error_code( status );
}
// Validates the operands of a trmm operation. trmm has no separate beta or C,
// so the shared checks are run with BLIS_ZERO standing in for beta and with B
// also passed as the output matrix. A must be triangular.
void bli_trmm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx
     )
{
	err_t status;

	// Checks common to hemm/symm/trmm/trsm.
	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// A must carry triangular structure.
	status = bli_check_triangular_object( a );
	bli_check_error_code( status );
}
// Validates the operands of a trsm operation. trsm has no separate beta or C,
// so the shared checks are run with BLIS_ZERO standing in for beta and with B
// also passed as the output matrix. A must be triangular.
void bli_trsm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx
     )
{
	err_t status;

	// Checks common to hemm/symm/trmm/trsm.
	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// A must carry triangular structure.
	status = bli_check_triangular_object( a );
	bli_check_error_code( status );
}
// -----------------------------------------------------------------------------
// Basic validation for gemm: the standard level-3 checks, the dimension
// conformality check, and then either the mixed-datatype restriction on
// alpha (when BLIS_ENABLE_GEMM_MD is defined) or the datatype consistency
// checks (when it is not).
void bli_gemm_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Standard checks on datatypes, object kinds, and buffers.
	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// A, B, and C must have conformal dimensions.
	status = bli_check_level3_dims( a, b, c );
	bli_check_error_code( status );

#ifdef BLIS_ENABLE_GEMM_MD

	// With mixed-datatype support enabled, differing datatypes among A, B,
	// and C are valid, so no consistency check is made. However, whenever
	// datatypes (or the computation precision of C) are mixed, alpha must not
	// have a non-zero imaginary component.
	if ( ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
	       bli_obj_dt( c ) != bli_obj_dt( b ) ||
	       bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) &&
	     !bli_obj_imag_is_zero( alpha ) )
	{
		bli_print_msg( "Mixed-datatype gemm does not yet support alpha with a non-zero imaginary component. Please contact BLIS developers for further support.", __FILE__, __LINE__ );
		bli_abort();
	}

#else // BLIS_DISABLE_GEMM_MD

	// Without mixed-datatype support, A and B must each match the datatype
	// of C.
	status = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( status );

#endif
}
// Basic validation for gemmt: the standard level-3 checks followed by the
// dimension conformality check.
void bli_gemmt_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Standard checks on datatypes, object kinds, and buffers.
	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// A, B, and C must have conformal dimensions.
	status = bli_check_level3_dims( a, b, c );
	bli_check_error_code( status );
}
// Basic validation shared by hemm/symm/trmm/trsm: standard level-3 checks,
// side-dependent dimension conformality, squareness of A, and datatype
// consistency of A and B with C.
void bli_hemm_basic_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Standard checks on datatypes, object kinds, and buffers.
	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// Dimension conformality: the operand order is (A, B) when A is applied
	// from the left and (B, A) otherwise.
	if ( bli_is_left( side ) ) status = bli_check_level3_dims( a, b, c );
	else                       status = bli_check_level3_dims( b, a, c );
	bli_check_error_code( status );

	// A must be square.
	status = bli_check_square_object( a );
	bli_check_error_code( status );

	// A and B must each match the datatype of C.
	status = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( status );
}
// Basic validation shared by herk and syrk. The caller supplies `ah`, an
// alias of A with the appropriate (conjugate-)transposition applied.
void bli_herk_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Standard checks on datatypes, object kinds, and buffers.
	bli_l3_basic_check( alpha, a, ah, beta, c, cntx );

	// A, its transposed alias, and C must have conformal dimensions.
	status = bli_check_level3_dims( a, ah, c );
	bli_check_error_code( status );

	// C must be square.
	status = bli_check_square_object( c );
	bli_check_error_code( status );

	// Both input operands must be general (unstructured) matrices.
	status = bli_check_general_object( a );
	bli_check_error_code( status );

	status = bli_check_general_object( ah );
	bli_check_error_code( status );

	// Both input operands must match the datatype of C.
	status = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, ah );
	bli_check_error_code( status );
}
// Basic validation shared by her2k and syr2k. The caller supplies `ah` and
// `bh`, aliases of A and B with the appropriate (conjugate-)transposition
// applied; the two products checked are (A, bh) and (B, ah).
void bli_her2k_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  bh,
       obj_t*  b,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t status;

	// Standard checks, once per operand pair.
	bli_l3_basic_check( alpha, a, bh, beta, c, cntx );
	bli_l3_basic_check( alpha, b, ah, beta, c, cntx );

	// Each operand pair must be dimensionally conformal with C.
	status = bli_check_level3_dims( a, bh, c );
	bli_check_error_code( status );

	status = bli_check_level3_dims( b, ah, c );
	bli_check_error_code( status );

	// C must be square.
	status = bli_check_square_object( c );
	bli_check_error_code( status );

	// All four input operands must be general (unstructured) matrices.
	status = bli_check_general_object( a );
	bli_check_error_code( status );

	status = bli_check_general_object( bh );
	bli_check_error_code( status );

	status = bli_check_general_object( b );
	bli_check_error_code( status );

	status = bli_check_general_object( ah );
	bli_check_error_code( status );

	// All four input operands must match the datatype of C.
	status = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, ah );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( status );

	status = bli_check_consistent_object_datatypes( c, bh );
	bli_check_error_code( status );
}
// Checks common to the level-3 operations: datatype class, object kind
// (scalar vs. matrix), and buffer non-NULLness of the five operands.
//
// The checks run in three phases (datatypes, dimensions, buffers), and every
// status code is handed to bli_check_error_code(). The cntx argument is not
// referenced here.
void bli_l3_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
    // Operand groups, each listed in the order in which it is validated.
    obj_t* const scalars[ 2 ]  = { alpha, beta };
    obj_t* const matrices[ 3 ] = { a, b, c };
    obj_t* const buffers[ 5 ]  = { alpha, a, b, beta, c };
    err_t        status;
    int          i;

    // Phase 1: object datatypes. Scalars must be non-integer; matrices must
    // be floating-point.
    for ( i = 0; i < 2; ++i )
    {
        status = bli_check_noninteger_object( scalars[ i ] );
        bli_check_error_code( status );
    }
    for ( i = 0; i < 3; ++i )
    {
        status = bli_check_floating_object( matrices[ i ] );
        bli_check_error_code( status );
    }

    // Phase 2: object dimensions. alpha and beta must be scalar objects;
    // a, b and c must be matrix objects.
    for ( i = 0; i < 2; ++i )
    {
        status = bli_check_scalar_object( scalars[ i ] );
        bli_check_error_code( status );
    }
    for ( i = 0; i < 3; ++i )
    {
        status = bli_check_matrix_object( matrices[ i ] );
        bli_check_error_code( status );
    }

    // Phase 3: object buffers (for non-NULLness).
    for ( i = 0; i < 5; ++i )
    {
        status = bli_check_object_buffer( buffers[ i ] );
        bli_check_error_code( status );
    }
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_check.h 0000664 0000000 0000000 00000007456 14634250137 0021551 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based check functions.
//
// Check-function prototypes for operations that take two input matrices
// (a, b), two scalars (alpha, beta), an output matrix c, and a context.
// One prototype is generated per GENPROT() invocation; the function name is
// formed by PASTEMAC(opname,_check) (PASTEMAC is defined elsewhere --
// presumably yielding bli_<opname>_check; confirm against its definition).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx \
);
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
// Check-function prototypes for operations that additionally take a side_t
// argument ahead of (alpha, a, b, beta, c, cntx). GENPROT is redefined here,
// so the previous definition no longer applies past this point.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx \
);
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
// Check-function prototypes for operations with a single input matrix a:
// the generated functions take (alpha, a, beta, c, cntx) and no b operand.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx \
);
GENPROT( herk )
GENPROT( syrk )
// Check-function prototypes for the operations declared below: the generated
// functions take (side, alpha, a, b, cntx) and have neither a beta scalar nor
// a separate c operand.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx \
);
GENPROT( trmm )
GENPROT( trsm )
// -----------------------------------------------------------------------------
// Prototypes for the "basic check" helpers. Unlike the macro-generated
// prototypes above, these are declared explicitly.

// Basic checks for gemm: operands (alpha, a, b, beta, c) plus a context.
void bli_gemm_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
// Basic checks for gemmt; same parameter list as bli_gemm_basic_check().
void bli_gemmt_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
// Basic checks for hemm; takes a side_t argument ahead of the operands.
void bli_hemm_basic_check
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
// Basic checks for herk. The second matrix argument is named ah rather than
// b (presumably the conjugate-transposed alias of a -- confirm with callers).
void bli_herk_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* ah,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
// Basic checks for her2k. Takes four matrix arguments forming the two
// products (a, bh) and (b, ah).
void bli_her2k_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* bh,
obj_t* b,
obj_t* ah,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
// Operand checks shared by the level-3 operations (datatypes, scalar/matrix
// kind, and buffer non-NULLness of alpha, a, b, beta and c).
void bli_l3_basic_check
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_cntl.c 0000664 0000000 0000000 00000007131 14634250137 0021415 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Produces the control tree a level-3 operation should execute with and
// stores it in *cntl_use.
//
// - cntl_orig != NULL: the caller's tree is duplicated with bli_cntl_copy()
//   and the copy's nodes are tagged with family via bli_cntl_mark_family().
// - cntl_orig == NULL: a default tree is built as a function of the
//   operation family. BLIS_GEMM, BLIS_GEMMT and BLIS_TRMM use
//   bli_gemm_cntl_create(); every other family is handled as trsm via
//   bli_trsm_cntl_create(), with the side taken to be left when a is
//   triangular and right otherwise.
//
// The b argument is not referenced by this function.
void bli_l3_cntl_create_if
     (
       opid_t   family,
       pack_t   schema_a,
       pack_t   schema_b,
       obj_t*   a,
       obj_t*   b,
       obj_t*   c,
       rntm_t*  rntm,
       cntl_t*  cntl_orig,
       cntl_t** cntl_use
     )
{
    // Families whose default tree is the gemm tree.
    const bool gemm_like = ( family == BLIS_GEMM  ||
                             family == BLIS_GEMMT ||
                             family == BLIS_TRMM );

    if ( cntl_orig != NULL )
    {
        // The user provided a control tree: create a copy and use it
        // instead, so that each thread can use its local tree as a place
        // to cache things like pack mem_t entries.
        *cntl_use = bli_cntl_copy( rntm, cntl_orig );

        // Recursively set the family fields of the newly copied nodes.
        bli_cntl_mark_family( family, *cntl_use );

        return;
    }

    // No tree was provided; construct the default for this family.
    if ( gemm_like )
    {
        *cntl_use = bli_gemm_cntl_create
        (
          rntm,
          family,
          schema_a,
          schema_b,
          bli_obj_ker_fn( c )
        );
    }
    else // family is assumed to be BLIS_TRSM
    {
        const side_t side = bli_obj_is_triangular( a ) ? BLIS_LEFT
                                                       : BLIS_RIGHT;

        *cntl_use = bli_trsm_cntl_create
        (
          rntm,
          side,
          schema_a,
          schema_b,
          bli_obj_ker_fn( c )
        );
    }
}
// Releases a control tree obtained from bli_l3_cntl_create_if().
//
// The family recorded in the tree selects the free routine: BLIS_GEMM,
// BLIS_GEMMT and BLIS_TRMM go through bli_gemm_cntl_free(); any other family
// goes through bli_trsm_cntl_free().
//
// NOTE: Calling separate _cntl_free() functions for gemm and trsm is not
// actually needed; it merely mirrors the _create() side, which must call
// different functions based on the family.
void bli_l3_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl_use,
       thrinfo_t* thread
     )
{
    const opid_t family = bli_cntl_family( cntl_use );

    if ( family != BLIS_GEMM  &&
         family != BLIS_GEMMT &&
         family != BLIS_TRMM )
    {
        // Assumed to be BLIS_TRSM.
        bli_trsm_cntl_free( rntm, cntl_use, thread );
        return;
    }

    bli_gemm_cntl_free( rntm, cntl_use, thread );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_cntl.h 0000664 0000000 0000000 00000004163 14634250137 0021424 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype conditional control tree creation functions.
//
// Stores in *cntl_use the control tree to execute with: a default tree
// built for the given family when cntl_orig is NULL, otherwise a copy of
// cntl_orig whose nodes are marked with family.
void bli_l3_cntl_create_if
(
opid_t family,
pack_t schema_a,
pack_t schema_b,
obj_t* a,
obj_t* b,
obj_t* c,
rntm_t* rntm,
cntl_t* cntl_orig,
cntl_t** cntl_use
);
// Frees the control tree cntl_use, dispatching on the family stored in the
// tree to either the gemm or the trsm free routine.
void bli_l3_cntl_free
(
rntm_t* rntm,
cntl_t* cntl_use,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_direct.c 0000664 0000000 0000000 00000007424 14634250137 0021734 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Returns the direction of movement for a level-3 operation by querying the
// operation family stored in cntl and delegating to the matching
// bli_<family>_direct() function. Families other than gemm, gemmt, trmm and
// trsm yield BLIS_FWD.
dir_t bli_l3_direct
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntl_t* cntl
     )
{
    const opid_t family = bli_cntl_family( cntl );

    // Fallback value; this should never be the one returned.
    dir_t direct = BLIS_FWD;

    if      ( family == BLIS_GEMM  ) { direct = bli_gemm_direct( a, b, c );  }
    else if ( family == BLIS_GEMMT ) { direct = bli_gemmt_direct( a, b, c ); }
    else if ( family == BLIS_TRMM  ) { direct = bli_trmm_direct( a, b, c );  }
    else if ( family == BLIS_TRSM  ) { direct = bli_trsm_direct( a, b, c );  }

    return direct;
}
// -----------------------------------------------------------------------------
// Direction of movement for gemm. Either direction would be valid for gemm,
// so forwards is always chosen; none of the operands is inspected.
dir_t bli_gemm_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     )
{
    // The operands do not influence the result.
    ( void )a;
    ( void )b;
    ( void )c;

    return BLIS_FWD;
}
// Direction of movement for gemmt. Either direction would be valid for
// gemmt, so forwards is always chosen; none of the operands is inspected.
dir_t bli_gemmt_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     )
{
    // The operands do not influence the result.
    ( void )a;
    ( void )b;
    ( void )c;

    return BLIS_FWD;
}
// Direction of movement for trmm, decided by which operand's root object is
// triangular and whether that root is lower-stored:
//
//   side   uplo    direction
//   left   lower   backwards
//   left   upper   forwards
//   right  lower   forwards
//   right  upper   backwards
//
// A triangular root of a corresponds to the left-side cases; otherwise b is
// assumed to be the triangular operand (right-side cases). c is not
// inspected.
dir_t bli_trmm_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     )
{
    // Left-side cases.
    if ( bli_obj_root_is_triangular( a ) )
        return bli_obj_root_is_lower( a ) ? BLIS_BWD : BLIS_FWD;

    // Right-side cases: the root of b is assumed to be triangular.
    return bli_obj_root_is_lower( b ) ? BLIS_FWD : BLIS_BWD;
}
// Direction of movement for trsm, decided by which operand's root object is
// triangular and whether that root is lower-stored:
//
//   side   uplo    direction
//   left   lower   forwards
//   left   upper   backwards
//   right  lower   backwards
//   right  upper   forwards
//
// A triangular root of a corresponds to the left-side cases; otherwise b is
// assumed to be the triangular operand (right-side cases). c is not
// inspected.
dir_t bli_trsm_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c
     )
{
    // Left-side cases.
    if ( bli_obj_root_is_triangular( a ) )
        return bli_obj_root_is_lower( a ) ? BLIS_FWD : BLIS_BWD;

    // Right-side cases: the root of b is assumed to be triangular.
    return bli_obj_root_is_lower( b ) ? BLIS_BWD : BLIS_FWD;
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_direct.h 0000664 0000000 0000000 00000004105 14634250137 0021732 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Returns the direction of movement for the level-3 operation whose family
// is recorded in cntl, given the operands a, b and c.
dir_t bli_l3_direct
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// Prototypes for the per-family direction functions. Each GENPROT()
// invocation declares one function taking (a, b, c) and returning dir_t; the
// name is formed by PASTEMAC0(opname) (defined elsewhere -- presumably
// yielding bli_gemm_direct, bli_gemmt_direct, bli_trmm_direct and
// bli_trsm_direct; confirm against its definition).
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
);
GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ft_ukr.h 0000664 0000000 0000000 00000006307 14634250137 0021760 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H
//
// -- Level-3 micro-kernel function types --------------------------------------
//
// gemm
//
// Function-pointer type for a gemm micro-kernel. The type name is formed by
// PASTECH3(ch,opname,_ukr,tsuf). The pointed-to function takes the dimensions
// m, n and k, the scalars alpha and beta, the operands a and b, the output c
// together with rs_c and cs_c (presumably the row and column strides of c --
// confirm), an auxinfo_t and a context.
// INSERT_GENTDEF is defined elsewhere; presumably it instantiates GENTDEF
// once per supported datatype.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( gemm )
// gemmtrsm_[lu]
//
// Function-pointer type for a fused gemm+trsm micro-kernel. Compared with
// the gemm micro-kernel type, there is no beta and the operands are passed
// as four separate pointers (a1x, a11, bx1, b11) plus the output c11 with
// rs_c and cs_c (presumably the row and column strides of c11 -- confirm).
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a1x, \
ctype* restrict a11, \
ctype* restrict bx1, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( gemmtrsm )
// trsm_[lu]
//
// Function-pointer type for a trsm micro-kernel. It takes no dimensions and
// no scalars: only the operands a and b, the output c with rs_c and cs_c
// (presumably the row and column strides of c -- confirm), an auxinfo_t and
// a context.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( trsm )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ind.c 0000664 0000000 0000000 00000017243 14634250137 0021234 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// This array tracks whether a particular operation is implemented for each of
// the induced methods.
// It is indexed as [ method ][ operation ]: one row per induced method (1m,
// then native) and one column per level-3 operation, in the order given by
// the column header below. Every entry is currently TRUE. The table is read
// by bli_l3_ind_oper_is_impl() and is never written in this file.
static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] =
{
/* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */
/* 1m */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE },
/* nat */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE }
};
//
// NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2.
//
// BLIS provides APIs to modify this state during runtime. So, it's possible for one
// application thread to modify the state before another starts the corresponding
// BLIS operation. This is solved by making the induced method status array local to
// threads.
// Enable/disable status table, indexed as [ method ][ operation ][ idt ],
// where idt is the value returned by bli_ind_map_cdt_to_index() for a
// complex datatype (the two slots are labelled c and z below).
// Initial state: every 1m entry is FALSE (disabled) and every native entry is
// TRUE (enabled). bli_l3_ind_oper_set_enable() never modifies the native row.
static BLIS_THREAD_LOCAL
bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] =
{
/* gemm gemmt hemm herk her2k symm
syrk syr2k trmm3 trmm trsm */
/* c z */
/* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE},
{FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} },
/* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE},
{TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} },
};
// -----------------------------------------------------------------------------
// Per-operation convenience wrappers. Each GENFUNC( opname, optype )
// invocation defines a function, named via PASTEMAC(opname,ind_find_avail),
// that takes a datatype and returns
// bli_l3_ind_oper_find_avail( optype, dt ).
// Wrappers are generated only for the seven operations listed below; none is
// generated here for herk, her2k, syrk or syr2k.
#undef GENFUNC
#define GENFUNC( opname, optype ) \
\
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \
{ \
return bli_l3_ind_oper_find_avail( optype, dt ); \
}
// Disabled companion wrapper, retained for reference only:
//bool PASTEMAC(opname,ind_has_avail)( num_t dt )
//{
// return bli_ind_oper_has_avail( optype, dt );
//}
GENFUNC( gemm, BLIS_GEMM )
GENFUNC( gemmt, BLIS_GEMMT )
GENFUNC( hemm, BLIS_HEMM )
GENFUNC( symm, BLIS_SYMM )
GENFUNC( trmm3, BLIS_TRMM3 )
GENFUNC( trmm, BLIS_TRMM )
GENFUNC( trsm, BLIS_TRSM )
// -----------------------------------------------------------------------------
// NOTE: The function below is compiled out by the surrounding #if 0 / #endif.
// As written, it would report a method as available for an operation only
// when dt is complex and the method is both implemented
// (bli_l3_ind_oper_is_impl) and enabled (bli_l3_ind_oper_get_enable).
#if 0
bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt )
{
bool enabled;
bool stat;
// If the datatype is real, it is never available.
if ( !bli_is_complex( dt ) ) return FALSE;
enabled = bli_l3_ind_oper_is_impl( oper, method );
stat = bli_l3_ind_oper_get_enable( oper, method, dt );
return ( enabled == TRUE && stat == TRUE );
}
#endif
// -----------------------------------------------------------------------------
// Returns the first method, scanning method indices upward from 0, that is
// available for the given operation and datatype, where "available" means
// both implemented and enabled.
//
// BLIS_NAT (native execution) is returned immediately when dt is not complex
// or when oper is not a level-3 operation. bli_init_once() is called before
// anything else.
ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt )
{
    bli_init_once();

    ind_t method = 0;

    // Real datatypes and non-level-3 operations always execute natively.
    if ( !bli_is_complex( dt ) || !bli_opid_is_level3( oper ) )
        return BLIS_NAT;

    while ( method < BLIS_NUM_IND_METHODS )
    {
        bool is_implemented = bli_l3_ind_oper_is_impl( oper, method );
        bool is_enabled     = bli_l3_ind_oper_get_enable( oper, method, dt );

        if ( is_implemented == TRUE )
        {
            if ( is_enabled == TRUE ) return method;
        }

        ++method;
    }

    // This return statement should never execute, since the native index
    // should be found even if all induced methods are unavailable. It is
    // present simply to avoid a compiler warning.
    return BLIS_NAT;
}
// -----------------------------------------------------------------------------
// Applies status to the given method for every level-3 operation id, for the
// datatype dt. Does nothing when dt is not complex.
void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status )
{
    opid_t oper = 0;

    if ( !bli_is_complex( dt ) ) return;

    // Visit all level-3 operation ids in increasing order.
    while ( oper < BLIS_NUM_LEVEL3_OPS )
    {
        bli_l3_ind_oper_set_enable( oper, method, dt, status );
        ++oper;
    }
}
// -----------------------------------------------------------------------------
// For the given operation and datatype, enables the requested method and
// disables every other method except native execution, which is never
// touched. Does nothing when dt is not complex or oper is not a level-3
// operation.
void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt )
{
    ind_t candidate;

    if ( !bli_is_complex( dt ) || !bli_opid_is_level3( oper ) ) return;

    for ( candidate = 0; candidate < BLIS_NUM_IND_METHODS; ++candidate )
    {
        // Native execution should always stay enabled, so only the other
        // methods are updated: TRUE for the requested one, FALSE otherwise.
        if ( candidate != BLIS_NAT )
        {
            bli_l3_ind_oper_set_enable( oper, candidate, dt,
                                        ( candidate == method ) ? TRUE : FALSE );
        }
    }
}
// Applies status to every method except native execution, for the given
// operation and datatype. Does nothing when dt is not complex or oper is not
// a level-3 operation.
void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status )
{
    ind_t candidate;

    if ( !bli_is_complex( dt ) || !bli_opid_is_level3( oper ) ) return;

    for ( candidate = 0; candidate < BLIS_NUM_IND_METHODS; ++candidate )
    {
        // Native execution should always stay enabled.
        if ( candidate == BLIS_NAT ) continue;

        bli_l3_ind_oper_set_enable( oper, candidate, dt, status );
    }
}
// -----------------------------------------------------------------------------
// A mutex to allow synchronous access to the bli_l3_ind_oper_st array.
static bli_pthread_mutex_t oper_st_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;

// Records status as the enable state of one (operation, method, datatype)
// combination in bli_l3_ind_oper_st.
//
// The request is silently ignored when dt is not complex, when oper is not a
// level-3 operation, or when method is BLIS_NAT (the status of native
// execution may not be changed). The store itself is performed while holding
// oper_st_mutex.
void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status )
{
    num_t dt_slot;

    // Reject requests that must not modify the table.
    if ( !bli_is_complex( dt )        ||
         !bli_opid_is_level3( oper ) ||
         method == BLIS_NAT ) return;

    // Map the complex datatype to its index within the status table.
    dt_slot = bli_ind_map_cdt_to_index( dt );

    // The critical section consists of the single store below.
    bli_pthread_mutex_lock( &oper_st_mutex );

    bli_l3_ind_oper_st[ method ][ oper ][ dt_slot ] = status;

    bli_pthread_mutex_unlock( &oper_st_mutex );
}
// Returns the enable state stored in bli_l3_ind_oper_st for the given
// (operation, method, datatype) combination.
//
// No argument validation is performed and no lock is taken; dt is passed
// directly to bli_ind_map_cdt_to_index() to obtain the datatype index.
bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt )
{
    const num_t dt_slot = bli_ind_map_cdt_to_index( dt );

    return bli_l3_ind_oper_st[ method ][ oper ][ dt_slot ];
}
// -----------------------------------------------------------------------------
// Returns the bli_l3_ind_oper_impl entry for the given method and operation,
// i.e. whether that operation is implemented for that method. The arguments
// are used as array indices without validation.
bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method )
{
    const bool implemented = bli_l3_ind_oper_impl[ method ][ oper ];

    return implemented;
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ind.h 0000664 0000000 0000000 00000005363 14634250137 0021241 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_IND_H
#define BLIS_L3_IND_H
// -----------------------------------------------------------------------------
// Prototypes for the per-operation "find available method" wrappers. Each
// GENPROT() invocation declares one function, named via
// PASTEMAC(opname,ind_find_avail), that takes a datatype and returns an
// ind_t. The commented-out ind_has_avail prototype is disabled.
#undef GENPROT
#define GENPROT( opname ) \
\
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt );
/*bool PASTEMAC(opname,ind_has_avail)( num_t dt ); */
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
GENPROT( trmm )
GENPROT( trsm )
// -----------------------------------------------------------------------------
// Disabled: the corresponding definition is compiled out.
//bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt );
// Returns the first method that is both implemented and enabled for
// (oper, dt); BLIS_NAT for real datatypes and non-level-3 operations.
ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt );
// Sets the status of one method for all level-3 operations and datatype dt.
void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status );
// Enables the given method and disables all other non-native methods for
// (oper, dt).
void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
// Sets the status of all non-native methods for (oper, dt).
void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status );
// Sets the status of one (oper, method, dt) combination; requests naming
// BLIS_NAT, a non-complex dt, or a non-level-3 oper are ignored.
void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status );
// Returns the stored status of one (oper, method, dt) combination.
bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt );
// Returns whether oper is implemented for the given method.
bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method );
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ind_ukr.h 0000664 0000000 0000000 00000006255 14634250137 0022123 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-3 micro-kernels.
//
// 1m micro-kernels
//
// Prototype template for the gemm micro-kernel of the 1m method. The
// parameter list matches the gemm micro-kernel function type: m, n, k,
// alpha, a, b, beta, c with rs_c/cs_c, an auxinfo_t and a context.
// INSERT_GENTPROT_BASIC0 and gemm1m_ukr_name are defined elsewhere;
// presumably the former instantiates GENTPROT once per datatype and the
// latter supplies the kernel's base name -- confirm against their
// definitions.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )
// Prototype template for the fused gemm+trsm micro-kernels of the 1m
// method, instantiated for two kernel names (the _l and _u variants). The
// functions take m, n, k, alpha, the four operand pointers a1x, a11, bx1 and
// b11, the output c11 with rs_c/cs_c, an auxinfo_t and a context.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a1x, \
ctype* restrict a11, \
ctype* restrict bx1, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm1m_u_ukr_name )
// Prototype template for the trsm micro-kernels of the 1m method,
// instantiated for two kernel names (the _l and _u variants). The functions
// take only the operands a and b, the output c with rs_c/cs_c, an auxinfo_t
// and a context.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm1m_u_ukr_name )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_int.c 0000664 0000000 0000000 00000011131 14634250137 0021242 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Internal level-3 driver for one control tree node.
//
// The function handles the trivial cases (NULL control tree node, operands
// with a zero dimension), aliases A, B and C into local objects, attaches
// alpha and beta to those aliases, grows the thrinfo_t structure, and finally
// calls the variant function stored in the control tree node.
//
//   alpha, beta - scalars; alpha is attached to the alias of A or B, beta to
//                 the alias of C.
//   a, b, c     - matrix operands.
//   cntx, rntm  - context and runtime, forwarded to the variant.
//   cntl        - current control tree node; a NULL node is a no-op.
//   thread      - thread info used for the chief test and the barrier.
void bli_l3_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // A NULL control tree node means there is nothing to execute.
    if ( bli_cntl_is_null( cntl ) )
    {
        return;
    }

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_gemm_basic_check( alpha, a, b, beta, c, cntx );
    }

    // No work is needed if C has a zero dimension.
    if ( bli_obj_has_zero_dim( c ) )
    {
        return;
    }

    // With a zero dimension in A or B the operation reduces to scaling C by
    // beta. Only the chief thread scales; every thread then meets at the
    // barrier before returning.
    if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) )
    {
        if ( bli_thread_am_ochief( thread ) )
        {
            bli_scalm( beta, c );
        }
        bli_thread_barrier( thread );
        return;
    }

    // Operands marked as all-zeros are not expected to reach this function,
    // so this case aborts. The statements following bli_abort() mirror the
    // zero-dimension handling above.
    if ( bli_obj_is_zeros( a ) || bli_obj_is_zeros( b ) )
    {
        // This should never execute.
        bli_abort();

        if ( bli_thread_am_ochief( thread ) )
        {
            bli_scalm( beta, c );
        }
        bli_thread_barrier( thread );
        return;
    }

    // Work on aliases so that the attached scalars (and the transposition of
    // C) can be changed without touching the caller's objects.
    obj_t a_local;
    obj_t b_local;
    obj_t c_local;

    bli_obj_alias_to( a, &a_local );
    bli_obj_alias_to( b, &b_local );
    bli_obj_alias_to( c, &c_local );

    // Fall back to bli_packm_blk_var1 when A or B has no packing function.
    if ( !bli_obj_pack_fn( &a_local ) )
    {
        bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );
    }
    if ( !bli_obj_pack_fn( &b_local ) )
    {
        bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );
    }

    // If C still carries a transposition, induce it now by swapping strides
    // and dimensions on the alias, then clear the transposition flag. This
    // would normally happen while packing C, but C may never be packed, so
    // this is the last opportunity. (The check is applied at every node, not
    // only at leaf nodes.)
    if ( bli_obj_has_trans( c ) )
    {
        bli_obj_induce_trans( &c_local );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
    }

    // A non-unit alpha is applied to the scalar attached to B, unless the
    // root of B is triangular, in which case it is applied to A instead.
    obj_t* alpha_host = bli_obj_root_is_triangular( b ) ? &a_local
                                                        : &b_local;
    if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( alpha, alpha_host );
    }

    // A non-unit beta is applied to the scalar attached to C.
    if ( !bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_scalar_apply_scalar( beta, &c_local );
    }

    // Create the next node in the thrinfo_t structure.
    bli_thrinfo_grow( rntm, cntl, thread );

    // Fetch the variant stored in the control tree node and run it on the
    // local aliases.
    l3_var_oft variant = bli_cntl_var_func( cntl );

    variant( &a_local, &b_local, &c_local, cntx, rntm, cntl, thread );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_int.h 0000664 0000000 0000000 00000003542 14634250137 0021256 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Internal level-3 driver. Takes the scalars alpha and beta, the operands
// a, b and c, plus the context, runtime, control tree node and thread info.
void bli_l3_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oapi.c 0000664 0000000 0000000 00000006413 14634250137 0021407 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based interfaces (basic).
//
// Basic object interface for gemm, gemmt, her2k and syr2k. Each generated
// function forwards its arguments to the matching expert interface
// (opname with the _ex suffix), passing NULL for the trailing cntx_t and
// rntm_t arguments to request the default objects.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c  \
     ) \
{ \
    PASTEMAC(opname,_ex) \
    ( \
      alpha, a, b, beta, c, \
      NULL, /* default cntx_t */ \
      NULL  /* default rntm_t */ \
    ); \
}

GENFRONT( gemm )
GENFRONT( gemmt )
GENFRONT( her2k )
GENFRONT( syr2k )
// Basic object interface for hemm, symm and trmm3. Each generated function
// forwards its arguments, including side, to the matching expert interface
// (opname with the _ex suffix), passing NULL for the trailing cntx_t and
// rntm_t arguments to request the default objects.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c  \
     ) \
{ \
    PASTEMAC(opname,_ex) \
    ( \
      side, alpha, a, b, beta, c, \
      NULL, /* default cntx_t */ \
      NULL  /* default rntm_t */ \
    ); \
}

GENFRONT( hemm )
GENFRONT( symm )
GENFRONT( trmm3 )
// Basic object interface for herk and syrk. Each generated function forwards
// its arguments to the matching expert interface (opname with the _ex
// suffix), passing NULL for the trailing cntx_t and rntm_t arguments to
// request the default objects.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c  \
     ) \
{ \
    PASTEMAC(opname,_ex) \
    ( \
      alpha, a, beta, c, \
      NULL, /* default cntx_t */ \
      NULL  /* default rntm_t */ \
    ); \
}

GENFRONT( herk )
GENFRONT( syrk )
// Basic object interface for trmm and trsm. Each generated function forwards
// its arguments, including side, to the matching expert interface (opname
// with the _ex suffix), passing NULL for the trailing cntx_t and rntm_t
// arguments to request the default objects.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b  \
     ) \
{ \
    PASTEMAC(opname,_ex) \
    ( \
      side, alpha, a, b, \
      NULL, /* default cntx_t */ \
      NULL  /* default rntm_t */ \
    ); \
}

GENFRONT( trmm )
GENFRONT( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oapi.h 0000664 0000000 0000000 00000005361 14634250137 0021415 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces (basic).
//
// Exported prototypes of the basic object interface for gemm, gemmt, her2k
// and syr2k: two scalars (alpha, beta) and three operands (a, b, c).
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c  \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
// Exported prototypes of the basic object interface for hemm, symm and
// trmm3: a side_t argument, two scalars (alpha, beta) and three operands
// (a, b, c).
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c  \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
// Exported prototypes of the basic object interface for herk and syrk: two
// scalars (alpha, beta) and two operands (a, c).
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c  \
     );

GENPROT( herk )
GENPROT( syrk )
// Exported prototypes of the basic object interface for trmm and trsm: a
// side_t argument, one scalar (alpha) and two operands (a, b).
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b  \
     );

GENPROT( trmm )
GENPROT( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oapi_ex.c 0000664 0000000 0000000 00000041161 14634250137 0022102 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define object-based interfaces (expert).
//
// If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be
// defined in the sandbox environment.
#ifndef BLIS_ENABLE_SANDBOX
// Expert object interface for gemm.
//
// The small/unpacked (sup) handler is tried first unless a caller-supplied
// rntm disables it. If that handler does not return BLIS_SUCCESS, execution
// continues with the conventional path: a local rntm_t is prepared, an
// induced method is chosen, a context is queried if none was given, the
// operands are optionally checked, and bli_gemm_front() is invoked.
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(gemm,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // sup handling is on by default; only a non-NULL rntm can turn it off.
    const bool try_sup = ( rntm == NULL ) ? TRUE : bli_rntm_l3_sup( rntm );

    // Run the small/unpacked handler. It reports BLIS_FAILURE when the
    // problem is outside the "small" thresholds, or when it declines for any
    // other reason; in that case we continue with the conventional
    // implementation below.
    if ( try_sup &&
         bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ) == BLIS_SUCCESS )
    {
        return;
    }

    // From here on always work with a local runtime: a copy of the caller's
    // rntm if one was passed in, otherwise one initialized from the global
    // settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    const num_t dt = bli_obj_dt( c );

    // An induced method is considered only if all three operands have a
    // complex storage datatype. NOTE: unlike the other level-3 operations,
    // gemm lets the precisions differ while using 1m; the others require
    // equal storage datatypes (and ignore the computation precision).
    const bool all_complex = bli_obj_is_complex( c ) &&
                             bli_obj_is_complex( a ) &&
                             bli_obj_is_complex( b );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = all_complex ? bli_gemmind_find_avail( dt )
                                 : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_gemm_check( alpha, a, b, beta, c, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL );
}
#endif
// Expert object interface for gemmt.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_gemmt_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    const num_t dt = bli_obj_dt( c );

    // An induced method is considered only if A, B and C share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( c ) &&
                                 bli_obj_dt( b ) == bli_obj_dt( c ) &&
                                 bli_obj_is_complex( c );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_gemmtind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_gemmt_check( alpha, a, b, beta, c, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL );
}
// Expert object interface for her2k, implemented as two gemmt calls:
// alpha*A*B' with the caller's beta, followed by alpha'*B*A' with a beta of
// one. cntx and rntm are passed through to gemmt unchanged.
void PASTEMAC(her2k,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bli_her2k_check( alpha, a, b, beta, c, cntx );
    }

    // alphah: alias of alpha with its conjugation toggled.
    obj_t alphah;
    bli_obj_alias_to( alpha, &alphah );
    bli_obj_toggle_conj( &alphah );

    // ah: alias of A with transposition and conjugation toggled.
    obj_t ah;
    bli_obj_alias_to( a, &ah );
    bli_obj_toggle_trans( &ah );
    bli_obj_toggle_conj( &ah );

    // bh: alias of B with transposition and conjugation toggled.
    obj_t bh;
    bli_obj_alias_to( b, &bh );
    bli_obj_toggle_trans( &bh );
    bli_obj_toggle_conj( &bh );

    // First product uses the caller's beta; the second accumulates into C by
    // using a beta of one.
    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha,   a, &bh, beta,      c, cntx, rntm );
    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm );

    // The product alpha*A*B' + alpha'*B*A' was computed for every element,
    // diagonal included. The diagonal of a Hermitian rank-2k product has zero
    // imaginary parts mathematically, but in practice meaningless non-zero
    // values can accumulate there, so they are explicitly set to zero here.
    bli_setid( &BLIS_ZERO, c );
}
// Expert object interface for syr2k, implemented as two gemmt calls: A
// times transposed B with the caller's beta, followed by B times transposed A
// with a beta of one. cntx and rntm are passed through to gemmt unchanged.
void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bli_syr2k_check( alpha, a, b, beta, c, cntx );
    }

    // bt: alias of B with its transposition toggled.
    obj_t bt;
    bli_obj_alias_to( b, &bt );
    bli_obj_toggle_trans( &bt );

    // at: alias of A with its transposition toggled.
    obj_t at;
    bli_obj_alias_to( a, &at );
    bli_obj_toggle_trans( &at );

    // First product uses the caller's beta; the second accumulates into C by
    // using a beta of one.
    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt, beta,      c, cntx, rntm );
    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm );
}
// Expert object interface for hemm.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_hemm_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(hemm,BLIS_OAPI_EX_SUF)
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    const num_t dt = bli_obj_dt( c );

    // An induced method is considered only if A, B and C share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( c ) &&
                                 bli_obj_dt( b ) == bli_obj_dt( c ) &&
                                 bli_obj_is_complex( c );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_hemmind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_hemm_check( side, alpha, a, b, beta, c, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
}
// Expert object interface for symm.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_symm_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(symm,BLIS_OAPI_EX_SUF)
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    const num_t dt = bli_obj_dt( c );

    // An induced method is considered only if A, B and C share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( c ) &&
                                 bli_obj_dt( b ) == bli_obj_dt( c ) &&
                                 bli_obj_is_complex( c );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_symmind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_symm_check( side, alpha, a, b, beta, c, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
}
// Expert object interface for trmm3.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_trmm3_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF)
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    const num_t dt = bli_obj_dt( c );

    // An induced method is considered only if A, B and C share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( c ) &&
                                 bli_obj_dt( b ) == bli_obj_dt( c ) &&
                                 bli_obj_is_complex( c );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_trmm3ind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_trmm3_check( side, alpha, a, b, beta, c, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL );
}
// Expert object interface for herk, implemented as a single gemmt call
// whose second operand is an alias of A with transposition and conjugation
// toggled. cntx and rntm are passed through to gemmt unchanged.
void PASTEMAC(herk,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bli_herk_check( alpha, a, beta, c, cntx );
    }

    // ah: alias of A with transposition and conjugation toggled.
    obj_t ah;
    bli_obj_alias_to( a, &ah );
    bli_obj_toggle_trans( &ah );
    bli_obj_toggle_conj( &ah );

    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm );

    // The product Re(alpha)*A*A' was computed for every element, diagonal
    // included. The diagonal of a Hermitian rank-k product has zero imaginary
    // parts mathematically, but in practice meaningless non-zero values can
    // accumulate there, so they are explicitly set to zero here.
    bli_setid( &BLIS_ZERO, c );
}
// Expert object interface for syrk, implemented as a single gemmt call
// whose second operand is an alias of A with its transposition toggled.
// cntx and rntm are passed through to gemmt unchanged.
void PASTEMAC(syrk,BLIS_OAPI_EX_SUF)
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Check parameters.
    if ( bli_error_checking_is_enabled() )
    {
        bli_syrk_check( alpha, a, beta, c, cntx );
    }

    // at: alias of A with its transposition toggled.
    obj_t at;
    bli_obj_alias_to( a, &at );
    bli_obj_toggle_trans( &at );

    PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm );
}
// Expert object interface for trmm.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_trmm_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(trmm,BLIS_OAPI_EX_SUF)
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    // The datatype of B drives the method and context selection.
    const num_t dt = bli_obj_dt( b );

    // An induced method is considered only if A and B share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( b ) &&
                                 bli_obj_is_complex( b );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_trmmind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_trmm_check( side, alpha, a, b, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL );
}
// Expert object interface for trsm.
//
// Prepares a local rntm_t, chooses an induced method, queries a context if
// none was given, optionally checks the operands, and invokes
// bli_trsm_front().
//
//   cntx - may be NULL, in which case a context is queried from the gks.
//   rntm - may be NULL, in which case the global settings are used.
void PASTEMAC(trsm,BLIS_OAPI_EX_SUF)
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
    bli_init_once();

    // Always work with a local runtime: a copy of the caller's rntm if one
    // was passed in, otherwise one initialized from the global settings.
    rntm_t rntm_l;

    if ( rntm != NULL ) { rntm_l = *rntm; }
    else                { bli_rntm_init_from_global( &rntm_l ); }

    rntm = &rntm_l;

    // The datatype of B drives the method and context selection.
    const num_t dt = bli_obj_dt( b );

    // An induced method is considered only if A and B share one storage
    // datatype and that datatype is complex.
    const bool same_complex_dt = bli_obj_dt( a ) == bli_obj_dt( b ) &&
                                 bli_obj_is_complex( b );

    // Pick the highest priority induced method that is both enabled and
    // available; BLIS_NAT (native execution) is the default, and is also what
    // the query returns when no induced method qualifies.
    const ind_t im = same_complex_dt ? bli_trsmind_find_avail( dt )
                                     : BLIS_NAT;

    // Query the gks for a context matching the induced method if the caller
    // did not provide one.
    if ( cntx == NULL )
    {
        cntx = bli_gks_query_ind_cntx( im, dt );
    }

    // Check the operands.
    if ( bli_error_checking_is_enabled() )
    {
        bli_trsm_check( side, alpha, a, b, cntx );
    }

    // Hand off to the front-end; NULL requests the default control tree.
    bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oapi_ex.h 0000664 0000000 0000000 00000005752 14634250137 0022115 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces (expert).
//
// Exported prototypes of the expert object interface for gemm, gemmt, her2k
// and syr2k: two scalars (alpha, beta), three operands (a, b, c), then the
// cntx_t and rntm_t pointers.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
// Exported prototypes of the expert object interface for hemm, symm and
// trmm3: a side_t argument, two scalars (alpha, beta), three operands
// (a, b, c), then the cntx_t and rntm_t pointers.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
// Exported prototypes of the expert object interface for herk and syrk: two
// scalars (alpha, beta), two operands (a, c), then the cntx_t and rntm_t
// pointers.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

GENPROT( herk )
GENPROT( syrk )
// Exported prototypes of the expert object interface for trmm and trsm: a
// side_t argument, one scalar (alpha), two operands (a, b), then the cntx_t
// and rntm_t pointers.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

GENPROT( trmm )
GENPROT( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oft.h 0000664 0000000 0000000 00000005605 14634250137 0021256 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H
//
// -- Level-3 object function types --------------------------------------------
//
// gemm, gemmt, her2k, syr2k
//
// Function pointer types (opname pasted with _oft) for operations that take
// two scalars, three operands, and the cntx_t and rntm_t pointers.
#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )
// hemm, symm, trmm3
//
// Function pointer types (opname pasted with _oft) for operations that take
// a side_t argument, two scalars, three operands, and the cntx_t and rntm_t
// pointers.
#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t  side, \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )
// herk, syrk
//
// Function-pointer type for operations that take a single input operand a
// (there is no b argument) in addition to alpha, beta, c, a context, and a
// runtime. The typedef name is built by PASTECH(opname,_oft).
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENTDEF( herk )
GENTDEF( syrk )
// trmm, trsm
//
// Function-pointer type for operations that take a side_t, alpha, and the
// operands a and b only: the signature has neither a beta nor a c argument.
// The typedef name is built by PASTECH(opname,_oft).
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
side_t side, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENTDEF( trmm )
GENTDEF( trsm )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_oft_var.h 0000664 0000000 0000000 00000004013 14634250137 0022116 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H
//
// -- Level-3 variant function types -------------------------------------------
//
// Function-pointer type for level-3 variants. GENTDEF( l3 ) produces a
// typedef named via PASTECH(l3,_var_oft) (presumably l3_var_oft -- confirm
// against the PASTECH definition). Unlike the operation-level types, the
// signature carries no alpha/beta scalars; it takes the three operands plus
// a context, a runtime, a control tree node, and a thrinfo_t node.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENTDEF( l3 )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_packab.c 0000664 0000000 0000000 00000006401 14634250137 0021675 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_l3_packa
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// a_view is a local alias of A that can be modified without touching
	// the caller's object; a_packed is filled in by bli_packm_int().
	obj_t a_view;
	obj_t a_packed;

	bli_obj_alias_to( a, &a_view );

	// When A is marked as transposed, induce that transposition on the
	// alias and then reset the alias to BLIS_NO_TRANSPOSE.
	if ( bli_obj_has_trans( a ) )
	{
		bli_obj_induce_trans( &a_view );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_view );
	}

	// Pack the (possibly adjusted) alias of A as directed by the current
	// control tree node.
	bli_packm_int( &a_view, &a_packed, cntx, rntm, cntl, thread );

	// Continue with the sub-nodes of the control tree and thrinfo_t,
	// substituting the packed object for A. Both scalars are BLIS_ONE.
	bli_l3_int
	(
	  &BLIS_ONE,
	  &a_packed,
	  b,
	  &BLIS_ONE,
	  c,
	  cntx,
	  rntm,
	  bli_cntl_sub_node( cntl ),
	  bli_thrinfo_sub_node( thread )
	);
}
// -----------------------------------------------------------------------------
void bli_l3_packb
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// bt_view is a local alias that is arranged to represent B^T, which is
	// what gets handed to bli_packm_int(); bt_packed receives the result.
	obj_t bt_view;
	obj_t bt_packed;

	bli_obj_alias_to( b, &bt_view );

	if ( !bli_obj_has_trans( b ) )
	{
		// B is not marked as transposed: induce a transposition on the
		// alias to obtain B^T.
		bli_obj_induce_trans( &bt_view );
	}
	else
	{
		// B is already marked as transposed: only reset the alias to
		// BLIS_NO_TRANSPOSE.
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_view );
	}

	// Pack B^T as directed by the current control tree node.
	bli_packm_int( &bt_view, &bt_packed, cntx, rntm, cntl, thread );

	// Undo the transposition on the packed object so it stands for B again.
	bli_obj_induce_trans( &bt_packed );

	// Continue with the sub-nodes of the control tree and thrinfo_t,
	// substituting the packed object for B. Both scalars are BLIS_ONE.
	bli_l3_int
	(
	  &BLIS_ONE,
	  a,
	  &bt_packed,
	  &BLIS_ONE,
	  c,
	  cntx,
	  rntm,
	  bli_cntl_sub_node( cntl ),
	  bli_thrinfo_sub_node( thread )
	);
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_packab.h 0000664 0000000 0000000 00000003751 14634250137 0021707 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Packs a local alias of a with bli_packm_int() using the given control tree
// node, then calls bli_l3_int() on the sub-nodes of cntl and thread with the
// packed object in place of a (see bli_l3_packab.c).
void bli_l3_packa
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Packs a transposed local alias of b with bli_packm_int() using the given
// control tree node, transposes the packed object back, then calls
// bli_l3_int() on the sub-nodes of cntl and thread with the packed object in
// place of b (see bli_l3_packab.c).
void bli_l3_packb
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_prune.c 0000664 0000000 0000000 00000014017 14634250137 0021607 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
void bli_l3_prune_unref_mparts_m
(
obj_t* a,
obj_t* b,
obj_t* c,
cntl_t* cntl
)
{
// Query the operation family.
opid_t family = bli_cntl_family( cntl );
if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c );
else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c );
else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c );
}
*/
// Generate the family-dispatching pruning front-ends, one per partitioning
// dimension (m, n, k). Each generated function reads the operation family
// from the control tree and forwards a, b, and c to the matching
// family-specific pruning function for that dimension. Families other than
// gemmt, trmm, and trsm (including gemm) result in no action.
#undef GENFRONT
#define GENFRONT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntl_t* cntl \
     ) \
{ \
	/* The operation family selects which pruning routine (if any) runs. */ \
	const opid_t family = bli_cntl_family( cntl ); \
\
	/* gemm never requires pruning. */ \
	if ( family == BLIS_GEMM ) return; \
\
	if ( family == BLIS_GEMMT ) \
	{ \
		PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \
		return; \
	} \
\
	if ( family == BLIS_TRMM ) \
	{ \
		PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \
		return; \
	} \
\
	if ( family == BLIS_TRSM ) \
	{ \
		PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \
	} \
}
GENFRONT( m )
GENFRONT( n )
GENFRONT( k )
// -----------------------------------------------------------------------------
// Generate the gemm pruning functions for the m, n, and k dimensions. All
// three have empty bodies: they exist so that a function is defined for
// every (family, dimension) pair, but they perform no pruning.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* No pruning is necessary for gemm. */ \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* No pruning is necessary for gemm. */ \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* No pruning is necessary for gemm. */ \
}
GENFRONT( gemm )
// -----------------------------------------------------------------------------
// Generate the gemmt pruning functions for the m, n, and k dimensions. The
// second operand is named ah here. The m and n variants prune with respect
// to C and adjust a or ah, respectively; the k variant has an empty body.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
( \
obj_t* a, \
obj_t* ah, \
obj_t* c \
) \
{ \
/* Prune any unreferenced part from the subpartition of C (that would
be encountered from partitioning in the m dimension) and adjust the
subpartition of A accordingly. */ \
bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
( \
obj_t* a, \
obj_t* ah, \
obj_t* c \
) \
{ \
/* Prune any unreferenced part from the subpartition of C (that would
be encountered from partitioning in the n dimension) and adjust the
subpartition of Ah accordingly. */ \
bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
( \
obj_t* a, \
obj_t* ah, \
obj_t* c \
) \
{ \
/* As long as A and Ah are general in structure, no pruning should be
needed for the k dimension. */ \
}
GENFRONT( gemmt )
// -----------------------------------------------------------------------------
// Generate the trmm and trsm pruning functions for the m, n, and k
// dimensions. The m variant prunes with respect to A and adjusts C; the n
// variant prunes with respect to B and adjusts C; the k variant prunes in
// both directions between A (its BLIS_N dimension) and B (its BLIS_M
// dimension).
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* Prune any unreferenced part from the subpartition of A (that would
be encountered from partitioning in the m dimension) and adjust the
subpartition of C accordingly. */ \
bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* Prune any unreferenced part from the subpartition of B (that would
be encountered from partitioning in the n dimension) and adjust the
subpartition of C accordingly. */ \
bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
) \
{ \
/* Prune any unreferenced part from the subpartition of A (that would
be encountered from partitioning in the k dimension) and adjust the
subpartition of B accordingly. */ \
bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); \
\
/* Prune any unreferenced part from the subpartition of B (that would
be encountered from partitioning in the k dimension) and adjust the
subpartition of A accordingly. */ \
bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); \
}
GENFRONT( trmm )
GENFRONT( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_prune.h 0000664 0000000 0000000 00000004536 14634250137 0021621 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the family-dispatching pruning functions, one per
// partitioning dimension (m, n, k). Each takes the three operands and the
// control tree node.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntl_t* cntl \
);
GENPROT( m )
GENPROT( n )
GENPROT( k )
// -----------------------------------------------------------------------------
// Prototypes for the family-specific pruning functions: one per combination
// of operation family (gemm, gemmt, trmm, trsm) and dimension (m, n, k).
// Unlike the dispatching functions above, these take no control tree node.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c \
);
GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )
GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )
GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )
GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_schema.c 0000664 0000000 0000000 00000006207 14634250137 0021720 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_l3_set_schemas
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx
     )
{
	pack_t schema_a;
	pack_t schema_b;

	if ( bli_cntx_method( cntx ) != BLIS_1M )
	{
		// Native execution: plain row panels for A, column panels for B.
		schema_a = BLIS_PACKED_ROW_PANELS;
		schema_b = BLIS_PACKED_COL_PANELS;
	}
	else
	{
		// The 1m method is in effect. Which of the two 1m schema pairings
		// applies is decided by the microkernel preference recorded in the
		// context, queried with a datatype assembled from C's domain and
		// computation precision.
		num_t dt = bli_obj_domain( c ) | bli_obj_comp_prec( c );

		// NOTE: bli_cntx_l3_vir_ukr_prefers_cols_dt() works with the real
		// projection of dt, i.e. it reports the preference of the
		// corresponding native real-domain microkernel.
		if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
		{
			schema_a = BLIS_PACKED_ROW_PANELS_1E;
			schema_b = BLIS_PACKED_COL_PANELS_1R;
		}
		else
		{
			schema_a = BLIS_PACKED_ROW_PANELS_1R;
			schema_b = BLIS_PACKED_COL_PANELS_1E;
		}
	}

	// Record the chosen schemas on the A and B objects themselves. This is
	// a (somewhat hacky) channel for handing the schemas to
	// bli_gemm_cntl_create() by way of bli_l3_thread_decorator() and
	// bli_l3_cntl_create_if(), so that they can later be read back from the
	// control tree, e.g. in bli_packm_init().
	bli_obj_set_pack_schema( schema_a, a );
	bli_obj_set_pack_schema( schema_b, b );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_schema.h 0000664 0000000 0000000 00000003372 14634250137 0021725 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Chooses pack schemas for a and b (native by default, or one of the two 1m
// pairings when the context's method is BLIS_1M) and stores them on those
// objects via bli_obj_set_pack_schema(). c is only queried for its domain
// and computation precision (see bli_l3_schema.c).
void bli_l3_set_schemas
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup.c 0000664 0000000 0000000 00000015736 14634250137 0021276 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
err_t bli_gemmsup
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Small/unpacked handling may be disabled at configure-time, in which
	// case this function always declines.
	#ifdef BLIS_DISABLE_SUP_HANDLING
	return BLIS_FAILURE;
	#endif

	// Decline mixed-datatype problems: A and B must have C's datatype, and
	// C's computation precision must equal its storage precision.
	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ) return BLIS_FAILURE;
	if ( bli_obj_dt( c ) != bli_obj_dt( b ) ) return BLIS_FAILURE;
	if ( bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE;

	// Fall back to a (native) context from the gks when none was supplied.
	// NOTE: The context is dereferenced by the queries below, so this has
	// to happen first.
	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

	// Problem datatype and dimensions, shared by both threshold checks.
	const num_t dt = bli_obj_dt( c );
	const dim_t m  = bli_obj_length( c );
	const dim_t n  = bli_obj_width( c );
	const dim_t k  = bli_obj_width_after_trans( a );

	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) )
	{
		// A microkernel preference-induced transposition would have been
		// performed, so test the thresholds with m and n exchanged, which
		// simulates transposing the entire operation. Decline if that
		// pushes the problem outside the space of sup-handled problems.
		if ( !bli_cntx_l3_sup_thresh_is_met( dt, n, m, k, cntx ) )
			return BLIS_FAILURE;
	}
	else
	{
		// The microkernel is content with the storage of C: test the
		// thresholds against the dimensions as given.
		if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) )
			return BLIS_FAILURE;
	}

	// Always work on a local runtime: either a fresh one initialized from
	// the global settings, or a copy of the one passed in.
	rntm_t rntm_l;
	if ( rntm == NULL )
	{
		bli_rntm_init_from_global( &rntm_l );
	}
	else
	{
		rntm_l = *rntm;
	}
	rntm = &rntm_l;

#if 0
	const dim_t tm = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx );
	const dim_t tn = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
	const dim_t tk = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );

	printf( "dims: %d %d %d (threshs: %d %d %d)\n",
	        (int)m, (int)n, (int)k, (int)tm, (int)tn, (int)tk );
#endif

	// Reaching this point means neither of these happened:
	// - the ukernel prefers the operation as-is, and the sup thresholds
	//   are unsatisfied;
	// - the ukernel prefers a transposed operation, and the sup thresholds
	//   are unsatisfied once the transposition is taken into account.
	// So at least one of the sup thresholds is met, and the small/unpacked
	// handler should be called.
	// NOTE: The sup handler is free to enforce a stricter threshold regime
	// of its own, in which case it can/should return BLIS_FAILURE.

	// Look up the small/unpacked handler in the context and return
	// whatever it returns.
	gemmsup_oft sup_handler = bli_cntx_get_l3_sup_handler( BLIS_GEMM, cntx );

	return sup_handler( alpha, a, b, beta, c, cntx, rntm );
}
err_t bli_gemmtsup
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Small/unpacked handling may be disabled at configure-time, in which
	// case this function always declines.
	#ifdef BLIS_DISABLE_SUP_HANDLING
	return BLIS_FAILURE;
	#endif

	// Decline mixed-datatype problems: A and B must have C's datatype, and
	// C's computation precision must equal its storage precision.
	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ) return BLIS_FAILURE;
	if ( bli_obj_dt( c ) != bli_obj_dt( b ) ) return BLIS_FAILURE;
	if ( bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE;

	// Fall back to a (native) context from the gks when none was supplied.
	// NOTE: The context is dereferenced by the queries below, so this has
	// to happen first.
	if ( cntx == NULL ) cntx = bli_gks_query_cntx();

	// Decline if the problem dimensions exceed their sup thresholds. The
	// length of C is passed for both the m and n arguments, so exchanging
	// them would make no difference; hence there is no need to ask whether
	// the microkernel prefers or dislikes the storage of C.
	const num_t dt = bli_obj_dt( c );
	const dim_t m  = bli_obj_length( c );
	const dim_t k  = bli_obj_width_after_trans( a );

	if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) )
		return BLIS_FAILURE;

	// Always work on a local runtime: either a fresh one initialized from
	// the global settings, or a copy of the one passed in.
	rntm_t rntm_l;
	if ( rntm == NULL )
	{
		bli_rntm_init_from_global( &rntm_l );
	}
	else
	{
		rntm_l = *rntm;
	}
	rntm = &rntm_l;

	// Reaching this point means the sup thresholds are not unsatisfied,
	// i.e. at least one of them is met, and the small/unpacked handler
	// should be called.
	// NOTE: The sup handler is free to enforce a stricter threshold regime
	// of its own, in which case it can/should return BLIS_FAILURE.

	// Look up the small/unpacked handler in the context and return
	// whatever it returns.
	gemmtsup_oft sup_handler = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx );

	return sup_handler( alpha, a, b, beta, c, cntx, rntm );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup.h 0000664 0000000 0000000 00000003744 14634250137 0021277 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Entry point for small/unpacked (sup) handling of gemm. Returns
// BLIS_FAILURE when the problem is declined (sup handling disabled,
// mixed-datatype operands, or sup thresholds not met); otherwise returns the
// result of the sup handler queried from the context. cntx and rntm may be
// NULL, in which case defaults are obtained internally (see bli_l3_sup.c).
err_t bli_gemmsup
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Entry point for small/unpacked (sup) handling of gemmt. Returns
// BLIS_FAILURE when the problem is declined (sup handling disabled,
// mixed-datatype operands, or sup thresholds not met); otherwise returns the
// result of the sup handler queried from the context. cntx and rntm may be
// NULL, in which case defaults are obtained internally (see bli_l3_sup.c).
err_t bli_gemmtsup
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_ft_ker.h 0000664 0000000 0000000 00000004646 14634250137 0022633 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_FT_KER_H
#define BLIS_L3_SUP_FT_KER_H
//
// -- Level-3 small/unpacked kernel function types -----------------------------
//
// gemmsup
// Function-pointer type for typed gemmsup kernels. The typedef name is built
// by PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF( gemmsup ) presumably
// instantiates GENTDEF once per supported datatype (ctype/ch pair) -- confirm
// against the INSERT_GENTDEF definition. The kernel receives conjugation
// flags for a and b, the m/n/k dimensions, pointers to alpha and beta, each
// matrix together with its row and column strides, an auxinfo_t, and a
// context.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
INSERT_GENTDEF( gemmsup )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_int.c 0000664 0000000 0000000 00000034130 14634250137 0022135 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// bli_gemmsup_int(): choose and invoke the reference gemmsup variant for a
// small/unpacked gemm problem.
//
// Steps:
//  1. Derive the stor3_t id from the strides of c, a and b and ask the context
//     whether the sup kernel for that id/datatype prefers rows.
//  2. The case is "primary" when the storage combination matches the kernel
//     preference:
//       - rrr rrc rcr crr for row-preferential kernels
//       - rcc crc ccr ccc for column-preferential kernels
//     Primary cases run with BLIS_NO_TRANSPOSE. All other cases run with
//     BLIS_TRANSPOSE, and m and n swap roles when counting micropanels.
//  3. Pick block-panel (var2m) when mu >= nu, otherwise panel-block (var1n),
//     where mu and nu are the micropanel counts in the (possibly swapped)
//     m and n dimensions.
//  4. If the thread factorization was automatic, recompute the jc/ic ways of
//     parallelism from mu and nu and refresh the thread's root thrinfo_t node.
//
// Parameters:
//   alpha, a, b, beta, c - operands, forwarded unchanged to the variant.
//   cntx                 - context queried for MR, NR and kernel preference.
//   rntm                 - runtime; its ways of parallelism are overwritten
//                          when auto-factorization is enabled.
//   thread               - the calling thread's thrinfo_t node.
//
// Returns: BLIS_SUCCESS, always.
//
err_t bli_gemmsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	const bool    is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt         = bli_obj_dt( c );
	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	const dim_t   m           = bli_obj_length( c );
	const dim_t   n           = bli_obj_width( c );
	const dim_t   MR          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t   NR          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
	const bool    auto_factor = bli_rntm_auto_factor( rntm );
	const dim_t   n_threads   = bli_rntm_num_threads( rntm );

	// Non-primary cases are executed on the transposed problem, so the n
	// dimension becomes m (and vice versa) when counting micropanels.
	const trans_t trans = ( is_primary ? BLIS_NO_TRANSPOSE : BLIS_TRANSPOSE );
	const dim_t   mu    = ( is_primary ? m : n ) / MR;
	const dim_t   nu    = ( is_primary ? n : m ) / NR;

	// Decide which algorithm to use (block-panel var2m or panel-block var1n)
	// based on the number of micropanels in the m and n dimensions.
	const bool    use_bp = ( mu >= nu );

	// If the parallel thread factorization was automatic, replace it with a
	// new factorization based on the matrix dimensions in units of
	// micropanels.
	if ( auto_factor )
	{
		dim_t jc_new;
		dim_t ic_new;

		if ( use_bp )
		{
			// Block-panel: the m dimension is parallelized with ic_nt and
			// the n dimension with jc_nt.
			bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
		}
		else
		{
			// Panel-block: the m dimension is parallelized with jc_nt and
			// the n dimension with ic_nt.
			bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
		}

		// Update the ways of parallelism for the jc and ic loops, and then
		// update the current thread's root thrinfo_t node according to the
		// new ways of parallelism value for the jc loop.
		bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
		bli_l3_sup_thrinfo_update_root( rntm, thread );
	}

#ifdef TRACEVAR
	if ( bli_thread_am_ochief( thread ) )
	{
		if ( is_primary )
		{
			if ( use_bp ) printf( "bli_l3_sup_int(): var2m primary\n" );
			else          printf( "bli_l3_sup_int(): var1n primary\n" );
		}
		else
		{
			if ( use_bp ) printf( "bli_l3_sup_int(): var2m non-primary\n" );
			else          printf( "bli_l3_sup_int(): var1n non-primary\n" );
		}
	}
#endif

	if ( use_bp )
	{
		// var2m. Primary: m -> mc, mr; n -> nc, nr.
		// Non-primary (transposed): m -> nc, nr; n -> mc, mr.
		bli_gemmsup_ref_var2m( trans,
		                       alpha, a, b, beta, c,
		                       stor_id, cntx, rntm, thread );
	}
	else
	{
		// var1n. Primary: m -> nc*,mr; n -> mc*,nr
		//   (*requires nudging of nc up to be a multiple of mr).
		// Non-primary (transposed): m -> mc*,nr; n -> nc*,mr
		//   (*requires nudging of mc up to be a multiple of nr).
		bli_gemmsup_ref_var1n( trans,
		                       alpha, a, b, beta, c,
		                       stor_id, cntx, rntm, thread );
	}

	// Return success so that the caller knows that we computed the solution.
	return BLIS_SUCCESS;
}
// -----------------------------------------------------------------------------
//
// bli_gemmtsup_int(): gemmt counterpart of bli_gemmsup_int().
//
// The function performs the same storage/preference analysis and, when the
// thread factorization is automatic, the same re-factorization of the jc/ic
// ways of parallelism as bli_gemmsup_int(). Here n is taken to be equal to m
// (only the length of c is queried).
//
// NOTE: Every call to a gemmtsup reference variant is currently compiled out
// with `#if 0`. As written, this function therefore only (optionally) updates
// rntm/thread and emits TRACEVAR output; it does not compute anything.
//
// Parameters:
//   alpha, a, b, beta, c - operands (only the strides and datatype are read).
//   cntx                 - context queried for MR, NR and kernel preference.
//   rntm                 - runtime; its ways of parallelism are overwritten
//                          when auto-factorization is enabled.
//   thread               - the calling thread's thrinfo_t node.
//
// Returns: BLIS_SUCCESS, always.
//
err_t bli_gemmtsup_int
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );

	const bool    is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR ||
	                                     stor_id == BLIS_RRC ||
	                                     stor_id == BLIS_RCR ||
	                                     stor_id == BLIS_CRR );
	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;

	const num_t   dt         = bli_obj_dt( c );
	const bool    row_pref   = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );

	const bool    is_primary = ( row_pref ? is_rrr_rrc_rcr_crr
	                                      : is_rcc_crc_ccr_ccc );

	const dim_t   m           = bli_obj_length( c );
	const dim_t   n           = m;
	const dim_t   MR          = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
	const dim_t   NR          = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
	const bool    auto_factor = bli_rntm_auto_factor( rntm );
	const dim_t   n_threads   = bli_rntm_num_threads( rntm );

	// Non-primary cases correspond to the transposed problem, so the n
	// dimension becomes m (and vice versa) when counting micropanels.
	const dim_t   mu = ( is_primary ? m : n ) / MR;
	const dim_t   nu = ( is_primary ? n : m ) / NR;

	// Decide which algorithm would be used (block-panel var2m or panel-block
	// var1n) based on the number of micropanels in the m and n dimensions.
	const bool    use_bp = ( mu >= nu );

	// If the parallel thread factorization was automatic, replace it with a
	// new factorization based on the matrix dimensions in units of
	// micropanels.
	if ( auto_factor )
	{
		dim_t jc_new;
		dim_t ic_new;

		if ( use_bp )
		{
			// Block-panel: the m dimension is parallelized with ic_nt and
			// the n dimension with jc_nt.
			bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new );
		}
		else
		{
			// Panel-block: the m dimension is parallelized with jc_nt and
			// the n dimension with ic_nt.
			bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new );
		}

		// Update the ways of parallelism for the jc and ic loops, and then
		// update the current thread's root thrinfo_t node according to the
		// new ways of parallelism value for the jc loop.
		bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
		bli_l3_sup_thrinfo_update_root( rntm, thread );
	}

#ifdef TRACEVAR
	if ( bli_thread_am_ochief( thread ) )
	{
		if ( is_primary )
		{
			if ( use_bp ) printf( "bli_l3_sup_int(): var2m primary\n" );
			else          printf( "bli_l3_sup_int(): var1n primary\n" );
		}
		else
		{
			if ( use_bp ) printf( "bli_l3_sup_int(): var2m non-primary\n" );
			else          printf( "bli_l3_sup_int(): var1n non-primary\n" );
		}
	}
#endif

	// Variant dispatch -- disabled. Kept for reference only.
#if 0
	{
		const trans_t trans = ( is_primary ? BLIS_NO_TRANSPOSE
		                                   : BLIS_TRANSPOSE );

		if ( use_bp )
		{
			// var2m. Primary: m -> mc, mr; n -> nc, nr.
			// Non-primary (transposed): m -> nc, nr; n -> mc, mr.
			bli_gemmtsup_ref_var2m( trans,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
		else
		{
			// var1n. Primary: m -> nc*,mr; n -> mc*,nr
			//   (*requires nudging of nc up to be a multiple of mr).
			// Non-primary (transposed): m -> mc*,nr; n -> nc*,mr
			//   (*requires nudging of mc up to be a multiple of nr).
			bli_gemmtsup_ref_var1n( trans,
			                        alpha, a, b, beta, c,
			                        stor_id, cntx, rntm, thread );
		}
	}
#endif

	// Return success.
	// NOTE(review): with the variant calls compiled out above, nothing is
	// computed on this path even though success is reported -- confirm that
	// no caller relies on this function to produce a result.
	return BLIS_SUCCESS;
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_int.h 0000664 0000000 0000000 00000004037 14634250137 0022145 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Thread-level entry point for small/unpacked gemm. Takes the operand
// objects (alpha, a, b, beta, c), the context, the runtime, and the calling
// thread's thrinfo_t node, and returns an err_t status code.
err_t bli_gemmsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Thread-level entry point for small/unpacked gemmt. Same parameter list as
// the gemm counterpart declared in this header: operand objects (alpha, a, b,
// beta, c), the context, the runtime, and the calling thread's thrinfo_t
// node. Returns an err_t status code.
err_t bli_gemmtsup_int
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_ker.h 0000664 0000000 0000000 00000004503 14634250137 0022132 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l3_sup_ker_prot.h.
// Alias GENTPROT to GEMMSUP_KER_PROT so that each INSERT_GENTPROT_BASIC0()
// line below emits prototypes with the gemmsup kernel signature. The
// gemmsup_*_ukr_name arguments are not defined in this header; presumably
// they are macros that expand to the kernel base names for each kernel
// family (rv, rg, cv, cg, rd, cd, gx) -- TODO confirm.
#undef GENTPROT
#define GENTPROT GEMMSUP_KER_PROT
INSERT_GENTPROT_BASIC0( gemmsup_rv_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_rg_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_cv_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_cg_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_rd_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_cd_ukr_name )
INSERT_GENTPROT_BASIC0( gemmsup_gx_ukr_name )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_ker_prot.h 0000664 0000000 0000000 00000004425 14634250137 0023201 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//
// GEMMSUP_KER_PROT( ctype, ch, opname ): expands to the prototype of one
// gemmsup kernel. The function name is formed by PASTEMAC(ch,opname); the
// function returns void and takes the conj_t flags for A and B, the m, n and
// k dimensions, alpha, A with its rs/cs increments, B with its rs/cs
// increments, beta, C with its rs/cs increments, an auxinfo_t pointer and a
// cntx_t pointer. All pointer parameters are restrict-qualified.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_oft.h 0000664 0000000 0000000 00000004041 14634250137 0022136 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019-20, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_SUP_OFT_H
#define BLIS_L3_SUP_OFT_H
//
// -- Level-3 small/unpacked object function types -----------------------------
//
// gemm
// GENTDEF( opname ): declares the object-based function-pointer type
// <opname>_oft (name pasted via PASTECH). The pointed-to function returns
// err_t and takes the operand objects (alpha, a, b, beta, c), a cntx_t
// pointer and a rntm_t pointer. Note that, unlike the *_int functions, this
// signature has no thrinfo_t parameter. Instantiated below for gemmsup and
// gemmtsup.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef err_t (*PASTECH(opname,_oft)) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm \
);
GENTDEF( gemmsup )
GENTDEF( gemmtsup )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_a.c 0000664 0000000 0000000 00000030516 14634250137 0022742 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// packm_sup_init_mem_a: make sure the mem_t used as the packing buffer for
// matrix A refers to a block that is large enough.
//
// - If will_pack is FALSE, the function does nothing.
// - Otherwise m is rounded up to a multiple of mr (m_pack), all threads meet
//   at a barrier, and the required size sizeof(ctype) * m_pack * k is
//   computed. Then:
//     * mem unallocated: the chief thread acquires a block of type
//       pack_buf_type via bli_pba_acquire_m() directly into its own mem; the
//       address of that mem_t is broadcast and every non-chief thread copies
//       its contents into its own mem.
//     * mem allocated but smaller than required: the chief thread releases
//       the old block and acquires a new one, followed by the same
//       broadcast/copy.
//     * mem allocated and large enough: used as-is.
//
// The cntx parameter is not referenced by the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Inspect whether we are going to be packing matrix A. */ \
if ( will_pack == FALSE ) \
{ \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same
upanel ldim for all iterations of the ir loop. */ \
const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
const dim_t k_pack = k; \
\
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was
passed in. It needs to be that mem_t struct, and not a
local (temporary) mem_t, since there is no barrier until
after packing is finished, which could allow a race
condition whereby the chief thread exits the current
function before the other threads have a chance to copy
from it. (A barrier would fix that race condition, but
then again, I prefer to keep barriers to a minimum.) */ \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_pba_release \
( \
rntm, \
mem \
); \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a )
// packm_sup_finalize_mem_a: hand the packing buffer for matrix A back to the
// memory broker once it is no longer needed.
//
//   did_pack - whether matrix A was packed; if FALSE nothing is done.
//   rntm     - runtime passed through to bli_pba_release().
//   mem      - mem_t entry that may hold the block to release.
//   thread   - calling thread's thrinfo_t; may be NULL, in which case
//              nothing is done.
//
// Only the chief thread releases the block, and only when the mem_t is
// reported as allocated.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool       did_pack, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* Bail out if A was never packed or if no thread info is available. */ \
	if ( did_pack == FALSE || thread == NULL ) return; \
\
	/* The chief thread releases the block, provided the mem_t holds one
	   (which it should). */ \
	if ( bli_thread_am_ochief( thread ) && bli_mem_is_alloc( mem ) ) \
	{ \
		bli_pba_release( rntm, mem ); \
	} \
}

INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a )
// packm_sup_init_a: fill in the buffer address, strides, panel parameters and
// pack schema that describe how matrix A will be accessed.
//
// Outputs (all written through pointers):
//   *schema        - BLIS_NOT_PACKED, BLIS_PACKED_ROWS or
//                    BLIS_PACKED_ROW_PANELS.
//   *m_max, *k_max - m and k when not packing; otherwise m rounded up to a
//                    multiple of mr, and k.
//   *p             - x when not packing; otherwise bli_mem_buffer( mem ).
//   *rs_p, *cs_p   - strides to use with *p.
//   *pd_p, *ps_p   - panel dimension (always mr) and panel stride.
//
// The cntx and thread parameters are not referenced by the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool       will_pack, \
       stor3_t    stor_id, \
       pack_t*    restrict schema, \
       dim_t      m, \
       dim_t      k, \
       dim_t      mr, \
       dim_t*     restrict m_max, \
       dim_t*     restrict k_max, \
       ctype*     x, inc_t rs_x, inc_t cs_x, \
       ctype**    p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                  dim_t* restrict pd_p, inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* These two values are the same whether or not A is packed. */ \
	*k_max = k; \
	*pd_p  = mr; \
\
	if ( will_pack == FALSE ) \
	{ \
		/* No packing: describe the source matrix itself. */ \
		*m_max  = m; \
		*rs_p   = rs_x; \
		*cs_p   = cs_x; \
		*ps_p   = mr * rs_x; \
		*schema = BLIS_NOT_PACKED; \
		*p      = x; \
		return; \
	} \
\
	/* NOTE: Rounding up the last upanel is optional for the rrc/crc cases
	   but necessary for the other cases, since the last micropanel must
	   have the same ldim (cs_p) as the other micropanels so that
	   millikernels can use one upanel ldim for all iterations of the ir
	   loop. */ \
	*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \
\
	/* _RRC and _CRC pack A to plain row storage; every other stor3_t id
	   packs A to column-stored row panels. */ \
	const bool to_plain_rows = ( stor_id == BLIS_RRC || \
	                             stor_id == BLIS_CRC ); \
\
	*rs_p   = ( to_plain_rows ? k : 1  ); \
	*cs_p   = ( to_plain_rows ? 1 : mr ); \
	*ps_p   = mr * k; \
	*schema = ( to_plain_rows ? BLIS_PACKED_ROWS \
	                          : BLIS_PACKED_ROW_PANELS ); \
\
	/* Point the caller at the memory held by the mem_t entry acquired
	   from the memory broker. */ \
	*p = bli_mem_buffer( mem ); \
}

INSERT_GENTFUNC_BASIC0( packm_sup_init_a )
//
// Define BLAS-like interfaces to the variant chooser.
//
// packm_sup_a: prepare and (optionally) pack matrix A for the sup code path.
//
// The function always runs the two setup helpers:
//   1. packm_sup_init_mem_a, called with m_alloc/k_alloc, which readies the
//      packing buffer (a no-op when will_pack is FALSE);
//   2. packm_sup_init_a, called with m/k, which sets *p, *rs_p, *cs_p and
//      *ps_p. When A is not packed, *p points at a and the strides are
//      those of a.
// If will_pack is FALSE the function then returns. Otherwise A is packed
// with packm_sup_var2 (schema BLIS_PACKED_ROWS) or packm_sup_var1 (any other
// schema), followed by a thread barrier so that packing completes before
// computation begins.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool       will_pack, \
       packbuf_t  pack_buf_type, \
       stor3_t    stor_id, \
       trans_t    transc, \
       dim_t      m_alloc, \
       dim_t      k_alloc, \
       dim_t      m, \
       dim_t      k, \
       dim_t      mr, \
       ctype*     restrict kappa, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                               inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	pack_t schema; \
	dim_t  m_max; \
	dim_t  k_max; \
	dim_t  pd_p; \
\
	/* Ready the destination buffer (no-op if packing is not requested). */ \
	PASTEMAC(ch,packm_sup_init_mem_a) \
	( \
	  will_pack, pack_buf_type, \
	  m_alloc, k_alloc, mr, \
	  cntx, rntm, mem, thread  \
	); \
\
	/* Determine the buffer address, strides and schema for A. */ \
	PASTEMAC(ch,packm_sup_init_a) \
	( \
	  will_pack, stor_id, &schema, \
	  m, k, mr, \
	  &m_max, &k_max, \
	  a, rs_a, cs_a, \
	  p, rs_p, cs_p, \
	  &pd_p, ps_p, \
	  cntx, mem, thread  \
	); \
\
	/* Without packing there is nothing left to do. */ \
	if ( will_pack == FALSE ) return; \
\
	if ( schema == BLIS_PACKED_ROWS ) \
	{ \
		/* Plain packing by rows uses var2. */ \
		PASTEMAC(ch,packm_sup_var2) \
		( \
		  transc, schema, \
		  m, k, \
		  kappa, \
		  a,  rs_a,  cs_a, \
		  *p, *rs_p, *cs_p, \
		  cntx, thread  \
		); \
	} \
	else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \
	{ \
		/* Packing to column-stored row panels uses var1. */ \
		PASTEMAC(ch,packm_sup_var1) \
		( \
		  transc, schema, \
		  m, k, \
		  m_max, k_max, \
		  kappa, \
		  a,  rs_a,  cs_a, \
		  *p, *rs_p, *cs_p, \
		  pd_p, *ps_p, \
		  cntx, thread  \
		); \
	} \
\
	/* Barrier so that packing is done before computation. */ \
	bli_thread_barrier( thread ); \
}

INSERT_GENTFUNC_BASIC0( packm_sup_a )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_a.h 0000664 0000000 0000000 00000007705 14634250137 0022753 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t m, \
dim_t k, \
dim_t mr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t m, \
dim_t k, \
dim_t mr, \
dim_t* restrict m_max, \
dim_t* restrict k_max, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_a )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t m_alloc, \
dim_t k_alloc, \
dim_t m, \
dim_t k, \
dim_t mr, \
ctype* restrict kappa, \
ctype* restrict a, inc_t rs_a, inc_t cs_a, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_a )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_b.c 0000664 0000000 0000000 00000030543 14634250137 0022743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* packm_sup_init_mem_b: make sure the mem_t entry used for the packed copy
   of matrix B refers to a block of at least
   sizeof( ctype ) * k * ( n rounded up to a multiple of nr ) bytes.

   will_pack      if FALSE, the function does nothing at all (not even the
                  barrier is executed).
   pack_buf_type  buffer type forwarded to bli_pba_acquire_m().
   k, n, nr       dimensions used to compute the required size (see above).
   cntx           not referenced by the body.
   rntm           forwarded to bli_pba_acquire_m() / bli_pba_release().
   mem            in/out: the calling thread's mem_t entry. Only the chief
                  thread acquires (or re-acquires) a block; every other
                  thread copies the chief's mem_t into its own.
   thread         thrinfo_t node used for the barrier, the chief test, and
                  the broadcast. */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
) \
{ \
/* Inspect whether we are going to be packing matrix B. */ \
if ( will_pack == FALSE ) \
{ \
/* If we aren't going to pack matrix B, no buffer is needed. */ \
} \
else /* if ( will_pack == TRUE ) */ \
{ \
/* NOTE: This "rounding up" of the last upanel is actually optional
for the rrc/crc cases, but absolutely necessary for the other cases
since we NEED that last micropanel to have the same ldim (cs_p) as
the other micropanels. Why? So that millikernels can use the same
upanel ldim for all iterations of the ir loop. */ \
const dim_t k_pack = k; \
const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \
\
/* Barrier to make sure all threads are caught up and ready to begin
the packm stage. */ \
bli_thread_barrier( thread ); \
\
/* Compute the size of the memory block needed. */ \
siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \
\
/* Check the mem_t entry provided by the caller. If it is unallocated,
then we need to acquire a block from the memory broker. */ \
if ( bli_mem_is_unalloc( mem ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* Acquire directly to the chief thread's mem_t that was
passed in. It needs to be that mem_t struct, and not a
local (temporary) mem_t, since there is no barrier until
after packing is finished, which could allow a race
condition whereby the chief thread exits the current
function before the other threads have a chance to copy
from it. (A barrier would fix that race condition, but
then again, I prefer to keep barriers to a minimum.) */ \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else /* if ( bli_mem_is_alloc( mem ) ) */ \
{ \
/* If the mem_t entry provided by the caller does NOT contain a NULL
buffer, then a block has already been acquired from the memory
broker and cached by the caller. */ \
\
/* As a sanity check, we should make sure that the mem_t object isn't
associated with a block that is too small compared to the size of
the packed matrix buffer that is needed, according to the value
computed above. */ \
siz_t mem_size = bli_mem_size( mem ); \
\
if ( mem_size < size_needed ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
/* The chief thread releases the existing block associated
with the mem_t, and then re-acquires a new block, saving
the associated mem_t to its passed-in mem_t. (See comment
above for why the acquisition needs to be directly to
the chief thread's passed-in mem_t and not a local
(temporary) mem_t.) */ \
bli_pba_release \
( \
rntm, \
mem \
); \
bli_pba_acquire_m \
( \
rntm, \
size_needed, \
pack_buf_type, \
mem \
); \
} \
\
/* Broadcast the address of the chief thread's passed-in mem_t
to all threads. */ \
mem_t* mem_p = bli_thread_broadcast( thread, mem ); \
\
/* Non-chief threads: Copy the contents of the chief thread's
passed-in mem_t to the passed-in mem_t for this thread. (The
chief thread already has the mem_t, so it does not need to
perform any copy.) */ \
if ( !bli_thread_am_ochief( thread ) ) \
{ \
*mem = *mem_p; \
} \
} \
else \
{ \
/* If the mem_t entry is already allocated and sufficiently large,
then we use it as-is. No action is needed. */ \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b )
/* packm_sup_finalize_mem_b: release the block associated with the mem_t
   entry that was used for packing matrix B.

   did_pack  if FALSE, matrix B was never packed and nothing is released.
   rntm      forwarded to bli_pba_release().
   mem       the mem_t entry to release; released only if it is allocated.
   thread    thrinfo_t node of the caller; may be NULL, in which case the
             function does nothing. Only the chief thread performs the
             release. */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool                did_pack, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Inspect whether we previously packed matrix B. If we didn't, there
	   is nothing to be done. */ \
	if ( did_pack == FALSE ) return; \
\
	/* Only the chief thread releases the block. A single short-circuit
	   condition guarantees that a NULL thread is never passed to
	   bli_thread_am_ochief() and avoids an unbraced nested if. */ \
	if ( thread != NULL && bli_thread_am_ochief( thread ) ) \
	{ \
		/* Check the mem_t entry provided by the caller. Only proceed if it
		   is allocated, which it should be. */ \
		if ( bli_mem_is_alloc( mem ) ) \
		{ \
			bli_pba_release \
			( \
			  rntm, \
			  mem \
			); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b )
/* packm_sup_init_b: compute the buffer address, strides, padded dimensions
   and pack schema that describe matrix B as it will be consumed, whether or
   not it is packed.

   Outputs (all written unconditionally):
     *schema        BLIS_NOT_PACKED, BLIS_PACKED_COLUMNS or
                    BLIS_PACKED_COL_PANELS.
     *k_max, *n_max k, and n (rounded up to a multiple of nr when packing).
     *p             x when not packing; otherwise bli_mem_buffer( mem ).
     *rs_p, *cs_p   strides to use when reading *p.
     *pd_p, *ps_p   panel dimension (always nr) and panel stride.

   cntx and thread are not referenced by the body. */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool                will_pack, \
       stor3_t             stor_id, \
       pack_t*    restrict schema, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       dim_t*     restrict k_max, \
       dim_t*     restrict n_max, \
       ctype*              x, inc_t           rs_x, inc_t           cs_x, \
       ctype**             p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                              dim_t* restrict pd_p, inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* These two outputs are the same on every path: the k dimension is
	   never padded, and the panel dimension is always nr. */ \
	*k_max = k; \
	*pd_p  = nr; \
\
	if ( will_pack == FALSE ) \
	{ \
		/* No packing: describe the source matrix itself. Its strides and
		   address are handed back unchanged, and the schema is marked as
		   "not packed" so that packing is skipped. */ \
		*n_max  = n; \
		*rs_p   = rs_x; \
		*cs_p   = cs_x; \
		*ps_p   = nr * cs_x; \
		*schema = BLIS_NOT_PACKED; \
		*p      = x; \
		return; \
	} \
\
	/* Packing: round n up to a whole number of nr-wide micropanels. This is
	   optional for the rrc/crc cases but required for all others, because
	   the last micropanel must have the same leading dimension as the rest
	   so that millikernels can use one ldim for every ir-loop iteration. */ \
	const dim_t n_upanels = n / nr + ( n % nr ? 1 : 0 ); \
\
	*n_max = n_upanels * nr; \
	*ps_p  = k * nr; \
\
	const bool plain_columns = ( stor_id == BLIS_RRC || \
	                             stor_id == BLIS_CRC ); \
\
	if ( plain_columns ) \
	{ \
		/* stor3_t ids _RRC and _CRC: pack B to plain column storage (unit
		   row stride, column stride of k). */ \
		*rs_p   = 1; \
		*cs_p   = k; \
		*schema = BLIS_PACKED_COLUMNS; \
	} \
	else \
	{ \
		/* All other stor3_t ids: pack B to row-stored column panels. */ \
		*rs_p   = nr; \
		*cs_p   = 1; \
		*schema = BLIS_PACKED_COL_PANELS; \
	} \
\
	/* The packed data lives in the block referenced by the caller's mem_t
	   entry. */ \
	*p = bli_mem_buffer( mem ); \
}

INSERT_GENTFUNC_BASIC0( packm_sup_init_b )
//
// Define BLAS-like interfaces to the variant chooser.
//
/* packm_sup_b: optionally pack matrix B for the sup code path.

   The function (1) prepares the destination buffer via
   packm_sup_init_mem_b() using the allocation dimensions k_alloc x n_alloc,
   (2) fills in *p, *rs_p, *cs_p and *ps_p via packm_sup_init_b(), and
   (3) if will_pack is TRUE, packs B with packm_sup_var2() (plain columns)
   or packm_sup_var1() (column panels) and then waits at a thread barrier.
   When will_pack is FALSE no packing and no final barrier take place; the
   outputs then describe the source matrix b itself. */
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       bool                will_pack, \
       packbuf_t           pack_buf_type, \
       stor3_t             stor_id, \
       trans_t             transc, \
       dim_t               k_alloc, \
       dim_t               n_alloc, \
       dim_t               k, \
       dim_t               n, \
       dim_t               nr, \
       ctype*     restrict kappa, \
       ctype*     restrict b, inc_t           rs_b, inc_t           cs_b, \
       ctype**    restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
                                                    inc_t* restrict ps_p, \
       cntx_t*    restrict cntx, \
       rntm_t*    restrict rntm, \
       mem_t*     restrict mem, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Values reported by packm_sup_init_b() that the caller does not need
	   but the packing variants do. */ \
	pack_t pack_schema; \
	dim_t  k_padded; \
	dim_t  n_padded; \
	dim_t  panel_dim; \
\
	/* Step 1: make sure a large enough buffer is available. This reduces
	   to a no-op when packing was not requested. */ \
	PASTEMAC(ch,packm_sup_init_mem_b) \
	( \
	  will_pack, pack_buf_type, \
	  k_alloc, n_alloc, nr, \
	  cntx, rntm, mem, thread \
	); \
\
	/* Step 2: determine the buffer address, strides and schema. When B is
	   not packed, *p ends up pointing at b with b's own strides. */ \
	PASTEMAC(ch,packm_sup_init_b) \
	( \
	  will_pack, stor_id, \
	  &pack_schema, \
	  k, n, nr, \
	  &k_padded, &n_padded, \
	  b, rs_b, cs_b, \
	  p, rs_p, cs_p, \
	  &panel_dim, ps_p, \
	  cntx, mem, thread \
	); \
\
	/* Step 3: nothing left to do unless B is actually being packed. */ \
	if ( will_pack == FALSE ) return; \
\
	if ( pack_schema == BLIS_PACKED_COLUMNS ) \
	{ \
		/* Plain packing by columns is handled by var2. */ \
		PASTEMAC(ch,packm_sup_var2) \
		( \
		  transc, pack_schema, \
		  k, n, \
		  kappa, \
		  b,  rs_b,  cs_b, \
		  *p, *rs_p, *cs_p, \
		  cntx, thread \
		); \
	} \
	else /* pack_schema == BLIS_PACKED_COL_PANELS */ \
	{ \
		/* Packing to row-stored column panels is handled by var1. */ \
		PASTEMAC(ch,packm_sup_var1) \
		( \
		  transc, pack_schema, \
		  k, n, \
		  k_padded, n_padded, \
		  kappa, \
		  b,  rs_b,  cs_b, \
		  *p, *rs_p, *cs_p, \
		  panel_dim, *ps_p, \
		  cntx, thread \
		); \
	} \
\
	/* Barrier so that packing is done before computation. */ \
	bli_thread_barrier( thread ); \
}

INSERT_GENTFUNC_BASIC0( packm_sup_b )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_b.h 0000664 0000000 0000000 00000007705 14634250137 0022754 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
dim_t k, \
dim_t n, \
dim_t nr, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool did_pack, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
stor3_t stor_id, \
pack_t* restrict schema, \
dim_t k, \
dim_t n, \
dim_t nr, \
dim_t* restrict k_max, \
dim_t* restrict n_max, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
dim_t* restrict pd_p, inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_init_b )
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
bool will_pack, \
packbuf_t pack_buf_type, \
stor3_t stor_id, \
trans_t transc, \
dim_t k_alloc, \
dim_t n_alloc, \
dim_t k, \
dim_t n, \
dim_t nr, \
ctype* restrict kappa, \
ctype* restrict b, inc_t rs_b, inc_t cs_b, \
ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \
inc_t* restrict ps_p, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
mem_t* restrict mem, \
thrinfo_t* restrict thread \
); \
INSERT_GENTPROT_BASIC0( packm_sup_b )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_var.c 0000664 0000000 0000000 00000035626 14634250137 0023321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces to the variants.
//
/* packm_sup_var1: pack an m x n source matrix c into micropanels stored
   contiguously in p, one packm_cxk() call per micropanel.

   transc        conjugation/transposition to apply to c. A transposition is
                 induced up front by swapping rs_c and cs_c.
   schema        pack schema; its "column packed" bit selects whether we
                 iterate over n (column panels) or over m (row panels).
   m, n          dimensions of the source matrix.
   m_max, n_max  padded dimensions; the one matching the panel length is
                 forwarded to packm_cxk().
   kappa         scalar forwarded to packm_cxk().
   c, rs_c, cs_c source matrix and its strides.
   p, rs_p, cs_p destination buffer and its strides.
   pd_p, ps_p    panel dimension and the stride between consecutive panels.
   cntx          forwarded to packm_cxk().
   thread        thrinfo_t node used to split the micropanels among threads. */
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       dim_t               m_max, \
       dim_t               n_max, \
       ctype*     restrict kappa, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       ctype*     restrict p, inc_t rs_p, inc_t cs_p, \
                              dim_t pd_p, inc_t ps_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Extract the conjugation bit before the transposition bit of transc
	   is touched below. */ \
	const conj_t conjc = bli_extract_conj( transc ); \
\
	/* Induce any requested transposition of c by swapping its strides, so
	   that the remaining code can treat c as untransposed. */ \
	if ( bli_does_trans( transc ) ) \
	{ \
		bli_swap_incs( &rs_c, &cs_c ); \
		bli_toggle_trans( &transc ); \
	} \
\
	/* The schema bit that encodes "row" or "column" describes the form of
	   the micropanel, not the storage within it: a column-packed schema
	   means row-stored column panels, and anything else means
	   column-stored row panels. */ \
	const bool col_panels = bli_is_col_packed( schema ); \
\
	/* Select the dimension that is partitioned into micropanels, the
	   (padded) panel length, and the matching strides of c and p. */ \
	const dim_t iter_dim      = ( col_panels ? n     : m     ); \
	const dim_t panel_len     = ( col_panels ? m     : n     ); \
	const dim_t panel_len_max = ( col_panels ? m_max : n_max ); \
	const dim_t panel_dim_max = pd_p; \
	const inc_t vs_c          = ( col_panels ? cs_c  : rs_c  ); \
	const inc_t ldc           = ( col_panels ? rs_c  : cs_c  ); \
	const inc_t ldp           = ( col_panels ? rs_p  : cs_p  ); \
\
	/* Number of micropanels, counting a partial last one. */ \
	const dim_t n_iter = iter_dim / panel_dim_max + \
	                     ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
	/* Query the number of threads and this thread's work id from the
	   thrinfo_t node. */ \
	const dim_t nt  = bli_thread_n_way( thread ); \
	const dim_t tid = bli_thread_work_id( thread ); \
\
	/* Suppress warnings in case nt/tid aren't used (ie: as in slab
	   partitioning). */ \
	( void )nt; \
	( void )tid; \
\
	/* Determine this thread's iteration range. NOTE: The definitions of
	   bli_thread_range_jrir() and bli_packm_my_iter() depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	dim_t it_start, it_end, it_inc; \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
	dim_t it; \
\
	/* Visit every logical micropanel; pack only those assigned to this
	   thread. */ \
	for ( it = 0; it < n_iter; it += 1 ) \
	{ \
		/* Offset of this micropanel along the partitioned dimension, and
		   its actual width (smaller than panel_dim_max only at the edge). */ \
		const dim_t ic          = it * panel_dim_max; \
		const dim_t panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
		/* Micropanel it starts ic vectors into c and it panel strides
		   into p. */ \
		ctype* restrict c_use = c + ic * vs_c; \
		ctype* restrict p_use = p + it * ps_p; \
\
		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
		{ \
			PASTEMAC(ch,packm_cxk) \
			( \
			  conjc, \
			  schema, \
			  panel_dim_i, \
			  panel_dim_max, \
			  panel_len, \
			  panel_len_max, \
			  kappa, \
			  c_use, vs_c, ldc, \
			  p_use,       ldp, \
			  cntx  \
			); \
		} \
	} \
}

INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
if ( col_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
if ( row_stored ) \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
else \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
/*
if ( col_stored ) { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
} \
else { \
if ( bli_thread_work_id( thread ) == 0 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
if ( bli_thread_work_id( thread ) == 1 ) \
{ \
printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
fflush( stdout ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \
( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \
( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
fflush( stdout ); \
} \
bli_thread_barrier( thread ); \
} \
*/
/*
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
*/
/*
if ( row_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
inc_t is_b = rs_p * *m_panel_max; \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/*
if ( col_stored ) { \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \
( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \
(( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
} \
*/
/* packm_sup_var2: pack an m x n source matrix c into plain (non-panel)
   row or column storage in p, one scal2v call per row or column.

   transc        conjugation/transposition to apply to c. A transposition is
                 induced up front by swapping rs_c and cs_c.
   schema        pack schema; its "column packed" bit selects packing by
                 columns (iterate over n) versus by rows (iterate over m).
   m, n          dimensions of the source matrix.
   kappa         scalar forwarded to scal2v.
   c, rs_c, cs_c source matrix and its strides.
   p, rs_p, cs_p destination buffer and its strides; elements within each
                 packed vector are written with unit stride.
   cntx          forwarded to scal2v.
   thread        thrinfo_t node used to split the vectors among threads. */
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t             transc, \
       pack_t              schema, \
       dim_t               m, \
       dim_t               n, \
       ctype*     restrict kappa, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       ctype*     restrict p, inc_t rs_p, inc_t cs_p, \
       cntx_t*    restrict cntx, \
       thrinfo_t* restrict thread \
     ) \
{ \
	/* Extract the conjugation bit before the transposition bit of transc
	   is touched below. */ \
	const conj_t conjc = bli_extract_conj( transc ); \
\
	/* Induce any requested transposition of c by swapping its strides, so
	   that the remaining code can treat c as untransposed. */ \
	if ( bli_does_trans( transc ) ) \
	{ \
		bli_swap_incs( &rs_c, &cs_c ); \
		bli_toggle_trans( &transc ); \
	} \
\
	/* A column-packed schema means we copy one column at a time into a
	   column-stored matrix; otherwise we copy one row at a time into a
	   row-stored matrix. */ \
	const bool by_columns = bli_is_col_packed( schema ); \
\
	/* Number of vectors to copy, their length, and the element/vector
	   strides in the source and destination. */ \
	const dim_t n_iter     = ( by_columns ? n    : m    ); \
	const dim_t vector_len = ( by_columns ? m    : n    ); \
	const inc_t incc       = ( by_columns ? rs_c : cs_c ); \
	const inc_t ldc        = ( by_columns ? cs_c : rs_c ); \
	const inc_t ldp        = ( by_columns ? cs_p : rs_p ); \
	const inc_t incp       = 1; \
\
	/* Query the number of threads and this thread's work id from the
	   thrinfo_t node. */ \
	const dim_t nt  = bli_thread_n_way( thread ); \
	const dim_t tid = bli_thread_work_id( thread ); \
\
	/* Suppress warnings in case nt/tid aren't used (ie: as in slab
	   partitioning). */ \
	( void )nt; \
	( void )tid; \
\
	/* Determine this thread's iteration range. NOTE: The definitions of
	   bli_thread_range_jrir() and bli_packm_my_iter() depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	dim_t it_start, it_end, it_inc; \
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
	dim_t it; \
\
	/* Visit every row/column of the source matrix; copy only those
	   assigned to this thread. */ \
	for ( it = 0; it < n_iter; it += 1 ) \
	{ \
		/* Vector it starts it leading dimensions into both c and p. */ \
		ctype* restrict c_use = c + it * ldc; \
		ctype* restrict p_use = p + it * ldp; \
\
		if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
		{ \
			PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
			( \
			  conjc, \
			  vector_len, \
			  kappa, \
			  c_use, incc, \
			  p_use, incp, \
			  cntx, \
			  NULL  \
			); \
		} \
	} \
}

INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_packm_var.h 0000664 0000000 0000000 00000005462 14634250137 0023321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces to the variants.
//
/* Prototype template for the type-specific packm_sup_var1() functions, which
   pack a source matrix c into micropanels in p. Compared with
   packm_sup_var2(), this variant additionally takes the padded dimensions
   (m_max, n_max), the panel dimension (pd_p) and the panel stride (ps_p). */
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
dim_t m_max, \
dim_t n_max, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
dim_t pd_p, inc_t ps_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( packm_sup_var1 )
// Prototype template for the typed packm_sup_var2 functions. GENTPROT is
// redefined so that the INSERT_GENTPROT_BASIC0() line below expands it for the
// variant name packm_sup_var2. This signature takes only the m and n
// dimensions, the scalar kappa, the source (c) and destination (p) pointers
// with their row/column strides, the context, and the thread info.
// NOTE(review): presumably one prototype is emitted per basic datatype
// character (ch) -- confirm against the definition of INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
trans_t transc, \
pack_t schema, \
dim_t m, \
dim_t n, \
ctype* restrict kappa, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
ctype* restrict p, inc_t rs_p, inc_t cs_p, \
cntx_t* restrict cntx, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( packm_sup_var2 )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_ref.c 0000664 0000000 0000000 00000012610 14634250137 0022116 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Default gemmsup handler.
//
// Validates the operands (when error checking is enabled), rejects problems
// in which any matrix uses general stride, derives the ways of parallelism
// from the rntm_t, and then hands the operation to the sup thread decorator
// with bli_gemmsup_int as the thread entry point.
//
// Returns BLIS_FAILURE if the storage combination of c, a, and b is BLIS_XXX;
// otherwise returns whatever bli_l3_sup_thread_decorator() returns.
//
// BLIS developers who want a different gemmsup handler should register their
// own function pointer in the context from within their sub-configuration's
// bli_cntx_init_*() function.
err_t bli_gemmsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Parameter checking is optional and controlled at runtime.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemm_check( alpha, a, b, beta, c, cntx );
	}

#if 0
	// NOTE: This special case handling is done within the variants.
	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}
	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return BLIS_SUCCESS;
	}
#endif

	// The small/unpacked implementation is not used when any of the matrices
	// has general stride. The test lives here, rather than in the caller
	// bli_gemmsup(), because this treatment of general stride is a property
	// of this particular implementation and need not apply to every possible
	// gemmsup handler. It also lives here, rather than in the thread entry
	// point bli_gemmsup_int(), so that we never have to combine per-thread
	// return values into a single value returned from the threaded region.
	if ( bli_obj_stor3_from_strides( c, a, b ) == BLIS_XXX )
	{
		return BLIS_FAILURE;
	}

	// Interpret the rntm_t so that the ways of parallelism are set for each
	// loop, based on the problem dimensions.
	{
		const dim_t m = bli_obj_length( c );
		const dim_t n = bli_obj_width( c );
		const dim_t k = bli_obj_width( a );

		bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );
	}

#if 0
	printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) );
	printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) );
	//bli_rntm_set_pack_a( 0, rntm );
	//bli_rntm_set_pack_b( 0, rntm );
#endif

	// Launch the (possibly multithreaded) computation; BLIS_GEMM is the
	// operation family id.
	return bli_l3_sup_thread_decorator
	(
	  bli_gemmsup_int,
	  BLIS_GEMM,
	  alpha, a, b, beta, c,
	  cntx,
	  rntm
	);
}
// -----------------------------------------------------------------------------
// Default gemmtsup handler.
//
// Validates the operands (when error checking is enabled), derives the ways
// of parallelism from the rntm_t, and then hands the operation to the sup
// thread decorator with bli_gemmtsup_int as the thread entry point. The
// return value is whatever bli_l3_sup_thread_decorator() returns.
//
// BLIS developers who want a different gemmtsup handler should register
// their own function pointer in the context from within their
// sub-configuration's bli_cntx_init_*() function.
err_t bli_gemmtsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
	// Parameter checking is optional and controlled at runtime.
	if ( bli_error_checking_is_enabled() )
	{
		bli_gemmt_check( alpha, a, b, beta, c, cntx );
	}

#if 0
	// NOTE: This special case handling is done within the variants.
	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}
	// If A or B has a zero dimension, scale C by beta and return early.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		bli_scalm( beta, c );
		return BLIS_SUCCESS;
	}
#endif

	// Interpret the rntm_t so that the ways of parallelism are set for each
	// loop, based on the problem dimensions.
	{
		const dim_t m = bli_obj_length( c );
		const dim_t n = bli_obj_width( c );
		const dim_t k = bli_obj_width( a );

		bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );
	}

	// Launch the (possibly multithreaded) computation; BLIS_GEMMT is the
	// operation family id.
	return bli_l3_sup_thread_decorator
	(
	  bli_gemmtsup_int,
	  BLIS_GEMMT,
	  alpha, a, b, beta, c,
	  cntx,
	  rntm
	);
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_ref.h 0000664 0000000 0000000 00000003754 14634250137 0022134 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype of the default gemmsup handler (defined in bli_l3_sup_ref.c).
// Takes the scalars alpha and beta, the operands a, b, and c, the context,
// and the runtime object; returns an err_t status.
err_t bli_gemmsup_ref
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
// Prototype of the default gemmtsup handler (defined in bli_l3_sup_ref.c).
// Same parameter list as bli_gemmsup_ref(); returns an err_t status.
err_t bli_gemmtsup_ref
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_var12.c 0000664 0000000 0000000 00000053600 14634250137 0022301 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is defined as the name gemmsup_fp, so the typedef below declares
// gemmsup_fp: a pointer to a function with the type-erased (void*) signature
// used by the datatype-specific gemmsup variant implementations in this file.
// The operands are passed as raw buffers with explicit row/column strides.
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
stor3_t eff_id,
cntx_t* restrict cntx,
rntm_t* restrict rntm
);
#if 0
//
// -- var2 ---------------------------------------------------------------------
//
// Function-pointer table declared through GENARRAY; it is indexed by the
// datatype of C (dt_exec) at the bottom of bli_gemmsup_ref_var2() to select
// the typed gemmsup_ref_var2 implementation.
static FUNCPTR_T GENARRAY(ftypes_var2,gemmsup_ref_var2);
// Object-based front end for variant 2. It unpacks the obj_t operands into
// raw buffers, dimensions, strides, and conjugation flags, and then calls the
// typed implementation selected by the datatype of C.
// NOTE: this function sits inside a file-level "#if 0" region and is
// therefore not compiled. The trans parameter is not referenced in the body.
void bli_gemmsup_ref_var2
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm
)
{
#if 0
// Disabled alternative: alias A and B and induce the transpositions on the
// aliases instead of swapping strides by hand (see the #else branch).
obj_t at, bt;
bli_obj_alias_to( a, &at );
bli_obj_alias_to( b, &bt );
// Induce transpositions on A and/or B if either object is marked for
// transposition. We can induce "fast" transpositions since the objects
// are guaranteed to not have structure or be packed.
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
const num_t dt_exec = bli_obj_dt( c );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( &at );
void* restrict buf_a = bli_obj_buffer_at_off( &at );
const inc_t rs_a = bli_obj_row_stride( &at );
const inc_t cs_a = bli_obj_col_stride( &at );
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
const inc_t rs_b = bli_obj_row_stride( &bt );
const inc_t cs_b = bli_obj_col_stride( &bt );
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
#else
// The execution datatype is taken from C; m and n are the dimensions of C.
const num_t dt_exec = bli_obj_dt( c );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
dim_t k;
void* restrict buf_a = bli_obj_buffer_at_off( a );
inc_t rs_a;
inc_t cs_a;
void* restrict buf_b = bli_obj_buffer_at_off( b );
inc_t rs_b;
inc_t cs_b;
// For A, k and the strides depend on whether A is marked for transposition.
if ( bli_obj_has_notrans( a ) )
{
k = bli_obj_width( a );
rs_a = bli_obj_row_stride( a );
cs_a = bli_obj_col_stride( a );
}
else // if ( bli_obj_has_trans( a ) )
{
// Assign the variables with an implicit transposition.
k = bli_obj_length( a );
rs_a = bli_obj_col_stride( a );
cs_a = bli_obj_row_stride( a );
}
// For B, only the strides depend on the transposition status.
if ( bli_obj_has_notrans( b ) )
{
rs_b = bli_obj_row_stride( b );
cs_b = bli_obj_col_stride( b );
}
else // if ( bli_obj_has_trans( b ) )
{
// Assign the variables with an implicit transposition.
rs_b = bli_obj_col_stride( b );
cs_b = bli_obj_row_stride( b );
}
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
// Obtain the alpha and beta buffers for the execution datatype.
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
#endif
// Index into the type combination array to extract the correct
// function pointer.
FUNCPTR_T f = ftypes_var2[dt_exec];
// Invoke the function.
f
(
conja,
conjb,
m,
n,
k,
buf_alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
buf_beta,
buf_c, rs_c, cs_c,
eff_id,
cntx,
rntm
);
}
// Template for the typed gemmsup_ref_var2 implementations; the
// INSERT_GENTFUNC_BASIC0() line after the macro expands it for the variant
// name gemmsup_ref_var2. (This code sits inside a file-level "#if 0" region
// and is not compiled.)
//
// Behavior of the generated function, as written below:
// - Returns immediately if any of m, n, k is zero.
// - If alpha equals zero, scales C by beta via scalm and returns.
// - Queries the sup blocksizes NC, KC, MC, NR, MR from the context and the
//   sup microkernel for the storage combination computed from the strides.
// - Runs five nested loops: jj over n in steps of NC, pp over k in steps of
//   KC, ii over m in steps of MC, j over n in steps of NR, and i over m in
//   steps of MR, invoking the microkernel on each mr_cur x nr_cur tile of C
//   with inner dimension kc_cur.
// - beta is applied only on the first iteration of the pp (k) loop; later
//   iterations pass a pointer to one so that they accumulate into C.
//
// NOTE(review): the eff_id and rntm parameters are not referenced in the
// body, and aux is passed to the microkernel without being initialized
// because the bli_auxinfo_set_next_*() calls are commented out.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm \
) \
{ \
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* If alpha is zero, scale by beta and return. */ \
if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c * NC; \
const inc_t jcstep_b = cs_b * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = rs_c * MC; \
const inc_t icstep_a = rs_a * MC; \
\
const inc_t jrstep_c = cs_c * NR; \
const inc_t jrstep_b = cs_b * NR; \
\
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
\
/* Query a stor3_t enum value to characterize the problem.
Examples: BLIS_RRR, BLIS_RRC, BLIS_RCR, BLIS_RCC, etc.
NOTE: If any matrix is general-stored, we use the all-purpose sup
microkernel corresponding to the stor3_t enum value BLIS_XXX. */ \
const stor3_t stor_id = bli_stor3_from_strides( rs_c, cs_c, \
rs_a, cs_a, rs_b, cs_b ); \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = n / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( n + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, jr_iter, and ir_iter variables are computed in
similar manner. */ \
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
const dim_t jc_left = n % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( m + MC - 1 ) / MC; \
const dim_t ic_left = m % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
const dim_t ir_inc = 1; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
const dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
const dim_t jr_left = nc_cur % NR; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc + j * jrstep_b; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/*
ctype* restrict b2 = b_jr; \
*/ \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \
{ \
const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict a_ir = a_ic + i * irstep_a; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
/*
ctype* restrict a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \
{ \
a2 = a_00; \
b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \
if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \
b2 = b_00; \
} \
\
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
*/ \
\
/* Invoke the gemmsup micro-kernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mr_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ir, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2 )
//
// -- var1 ---------------------------------------------------------------------
//
// Function-pointer table declared through GENARRAY; it is indexed by the
// datatype of C (dt_exec) at the bottom of bli_gemmsup_ref_var1() to select
// the typed gemmsup_ref_var1 implementation.
static FUNCPTR_T GENARRAY(ftypes_var1,gemmsup_ref_var1);
// Object-based front end for variant 1. It unpacks the obj_t operands into
// raw buffers, dimensions, strides, and conjugation flags, and then calls the
// typed implementation selected by the datatype of C.
// NOTE: this function sits inside a file-level "#if 0" region and is
// therefore not compiled. The trans parameter is not referenced in the body.
void bli_gemmsup_ref_var1
(
trans_t trans,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
stor3_t eff_id,
cntx_t* cntx,
rntm_t* rntm
)
{
#if 0
// Disabled alternative: alias A and B and induce the transpositions on the
// aliases instead of swapping strides by hand (see the #else branch).
obj_t at, bt;
bli_obj_alias_to( a, &at );
bli_obj_alias_to( b, &bt );
// Induce transpositions on A and/or B if either object is marked for
// transposition. We can induce "fast" transpositions since the objects
// are guaranteed to not have structure or be packed.
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }
const num_t dt_exec = bli_obj_dt( c );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
const dim_t k = bli_obj_width( &at );
void* restrict buf_a = bli_obj_buffer_at_off( &at );
const inc_t rs_a = bli_obj_row_stride( &at );
const inc_t cs_a = bli_obj_col_stride( &at );
void* restrict buf_b = bli_obj_buffer_at_off( &bt );
const inc_t rs_b = bli_obj_row_stride( &bt );
const inc_t cs_b = bli_obj_col_stride( &bt );
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
#else
// The execution datatype is taken from C; m and n are the dimensions of C.
const num_t dt_exec = bli_obj_dt( c );
const conj_t conja = bli_obj_conj_status( a );
const conj_t conjb = bli_obj_conj_status( b );
const dim_t m = bli_obj_length( c );
const dim_t n = bli_obj_width( c );
dim_t k;
void* restrict buf_a = bli_obj_buffer_at_off( a );
inc_t rs_a;
inc_t cs_a;
void* restrict buf_b = bli_obj_buffer_at_off( b );
inc_t rs_b;
inc_t cs_b;
// For A, k and the strides depend on whether A is marked for transposition.
if ( bli_obj_has_notrans( a ) )
{
k = bli_obj_width( a );
rs_a = bli_obj_row_stride( a );
cs_a = bli_obj_col_stride( a );
}
else // if ( bli_obj_has_trans( a ) )
{
// Assign the variables with an implicit transposition.
k = bli_obj_length( a );
rs_a = bli_obj_col_stride( a );
cs_a = bli_obj_row_stride( a );
}
// For B, only the strides depend on the transposition status.
if ( bli_obj_has_notrans( b ) )
{
rs_b = bli_obj_row_stride( b );
cs_b = bli_obj_col_stride( b );
}
else // if ( bli_obj_has_trans( b ) )
{
// Assign the variables with an implicit transposition.
rs_b = bli_obj_col_stride( b );
cs_b = bli_obj_row_stride( b );
}
void* restrict buf_c = bli_obj_buffer_at_off( c );
const inc_t rs_c = bli_obj_row_stride( c );
const inc_t cs_c = bli_obj_col_stride( c );
// Obtain the alpha and beta buffers for the execution datatype.
void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta );
#endif
// Index into the type combination array to extract the correct
// function pointer.
FUNCPTR_T f = ftypes_var1[dt_exec];
// Invoke the function.
f
(
conja,
conjb,
m,
n,
k,
buf_alpha,
buf_a, rs_a, cs_a,
buf_b, rs_b, cs_b,
buf_beta,
buf_c, rs_c, cs_c,
eff_id,
cntx,
rntm
);
}
// Template for the typed gemmsup_ref_var1 implementations; the
// INSERT_GENTFUNC_BASIC0() line after the macro expands it for the variant
// name gemmsup_ref_var1. (This code sits inside a file-level "#if 0" region
// and is not compiled.)
//
// Behavior of the generated function, as written below:
// - Returns immediately if any of m, n, k is zero.
// - If alpha equals zero, scales C by beta via scalm and returns.
// - Queries the sup blocksizes from the context, then rounds NC up to a
//   multiple of MR and MC up to a multiple of NR.
// - Compared with var2, the roles of the m and n dimensions in the loops are
//   exchanged: jj walks m in steps of NC, pp walks k in steps of KC, ii walks
//   n in steps of MC, j walks m in steps of MR, and i walks n in steps of NR.
// - beta is applied only on the first iteration of the pp (k) loop; later
//   iterations pass a pointer to one so that they accumulate into C.
//
// NOTE(review): jr_iter/jr_left are computed from MR, yet nr_cur selects NR
// for non-edge iterations; likewise ir_iter/ir_left are computed from NR, yet
// mr_cur selects MR. Confirm the intended tile sizes (and the m/n argument
// order passed to the microkernel) before re-enabling this code.
// NOTE(review): the eff_id and rntm parameters are not referenced in the
// body, aux is passed to the microkernel without being initialized, and the
// commented-out fprintm calls at the end refer to names (b_jr, a_ir) and a
// label ("gemmsup_ref_var2") that belong to var2 rather than this variant.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm \
) \
{ \
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* If alpha is zero, scale by beta and return. */ \
if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
\
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = rs_c * NC; \
const inc_t jcstep_a = rs_a * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = cs_c * MC; \
const inc_t icstep_b = cs_b * MC; \
\
const inc_t jrstep_c = rs_c * MR; \
const inc_t jrstep_a = rs_a * MR; \
\
const inc_t irstep_c = cs_c * NR; \
const inc_t irstep_b = cs_b * NR; \
\
/* Query a stor3_t enum value to characterize the problem.
Examples: BLIS_RRR, BLIS_RRC, BLIS_RCR, BLIS_RCC, etc.
NOTE: If any matrix is general-stored, we use the all-purpose sup
microkernel corresponding to the stor3_t enum value BLIS_XXX. */ \
const stor3_t stor_id = bli_stor3_from_strides( rs_c, cs_c, \
rs_a, cs_a, rs_b, cs_b ); \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = m / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( m + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, jr_iter, and ir_iter variables are computed in
similar manner. */ \
const dim_t jc_iter = ( m + NC - 1 ) / NC; \
const dim_t jc_left = m % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( n + MC - 1 ) / MC; \
const dim_t ic_left = n % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
const dim_t ir_inc = 1; \
\
/* Loop over the m dimension (NC rows at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
const dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
const dim_t jr_left = nc_cur % MR; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the n dimension (MC columns at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict b_ic = b_pc + ii * icstep_b; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
const dim_t ir_left = mc_cur % NR; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict a_jr = a_pc + j * jrstep_a; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \
{ \
const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
ctype* restrict b_ir = b_ic + i * irstep_b; \
ctype* restrict c_ir = c_jr + i * irstep_c; \
\
/* Invoke the gemmsup micro-kernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mr_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_jr, rs_a, cs_a, \
b_ir, rs_b, cs_b, \
beta_use, \
c_ir, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1 )
#endif
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_var1n2m.c 0000664 0000000 0000000 00000131760 14634250137 0022640 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is defined as the name gemmsup_fp, so the typedef below declares
// gemmsup_fp: a pointer to a function with the type-erased (void*) signature
// used by the datatype-specific gemmsup variant implementations in this file.
// In addition to the buffers, dimensions, and strides, this signature carries
// the packa/packb flags and a thrinfo_t pointer.
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
bool packa,
bool packb,
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
stor3_t eff_id,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
thrinfo_t* restrict thread
);
//
// -- var1n --------------------------------------------------------------------
//
// Function-pointer table for the typed gemmsup_ref_var1n implementations; it
// is indexed by the datatype of C in bli_gemmsup_ref_var1n() below.
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);

// Object-based front end for variant 1n.
//
// Unpacks the obj_t operands into raw buffers, dimensions, strides,
// conjugation flags, and packing flags; lets
// bli_gemmsup_ref_var1n2m_opt_cases() adjust trans and/or eff_id; and then
// calls the typed implementation selected by the datatype of C. When trans
// indicates a transposition, the whole operation is transposed by exchanging
// the roles of A and B, of m and n, and of row and column strides.
void bli_gemmsup_ref_var1n
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt        = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
#else
	// The execution datatype is that of C.
	const num_t    dt        = bli_obj_dt( c );

	// Packing preferences come from the runtime object.
	const bool     packa     = bli_rntm_pack_a( rntm );
	const bool     packb     = bli_rntm_pack_b( rntm );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	// m and n are the dimensions of C.
	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	// When an operand is marked for transposition, read its dimensions and
	// strides with the transposition applied implicitly (i.e. swapped).
	const bool     a_notrans = bli_obj_has_notrans( a );
	const bool     b_notrans = bli_obj_has_notrans( b );

	const dim_t    k         = ( a_notrans ? bli_obj_width( a )
	                                       : bli_obj_length( a ) );

	void* restrict buf_a     = bli_obj_buffer_at_off( a );
	const inc_t    rs_a      = ( a_notrans ? bli_obj_row_stride( a )
	                                       : bli_obj_col_stride( a ) );
	const inc_t    cs_a      = ( a_notrans ? bli_obj_col_stride( a )
	                                       : bli_obj_row_stride( a ) );

	void* restrict buf_b     = bli_obj_buffer_at_off( b );
	const inc_t    rs_b      = ( b_notrans ? bli_obj_row_stride( b )
	                                       : bli_obj_col_stride( b ) );
	const inc_t    cs_b      = ( b_notrans ? bli_obj_col_stride( b )
	                                       : bli_obj_row_stride( b ) );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt, beta );
#endif

	// Select the typed implementation for this datatype.
	FUNCPTR_T f = ftypes_var1n[dt];

#if 1
	// Some storage/packing cases are optimized by recasting them as others;
	// the helper communicates this by modifying trans and/or eff_id.
	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
#endif

	// No transposition requested: call the implementation as-is.
	if ( bli_is_notrans( trans ) )
	{
		f
		(
		  packa, packb,
		  conja, conjb,
		  m, n, k,
		  buf_alpha,
		  buf_a, rs_a, cs_a,
		  buf_b, rs_b, cs_b,
		  buf_beta,
		  buf_c, rs_c, cs_c,
		  eff_id,
		  cntx,
		  rntm,
		  thread
		);

		return;
	}

	// Otherwise, execute the transposed operation.
	f
	(
	  packb, packa,              // swap the pack values.
	  conjb, conja,              // swap the conj values.
	  n, m,                      // swap the m and n dimensions.
	  k,
	  buf_alpha,
	  buf_b, cs_b, rs_b,         // swap the positions of A and B.
	  buf_a, cs_a, rs_a,         // swap the strides of A and B.
	  buf_beta,
	  buf_c, cs_c, rs_c,         // swap the strides of C.
	  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
	  cntx,
	  rntm,
	  thread
	);
}
// Type-templated body of the var1n ("panel-block") gemmsup algorithm. The
// INSERT_GENTFUNC_BASIC0 line at the end instantiates it (presumably once per
// datatype character `ch`); the instances are what the ftypes_var1n table
// refers to.
//
// Parameters of each generated function:
//   packa, packb  - whether A and/or B should be packed before use.
//   conja, conjb  - conjugation flags, forwarded to the millikernel.
//   m, n, k       - problem dimensions. m is partitioned by the JC and JR
//                   loops, k by the PC loop, and n by the IC loop.
//   alpha, beta   - pointers to scalars of type ctype.
//   a, b, c       - matrix buffers, each with its row and column stride.
//   stor_id       - storage combination id; it is transposed on entry before
//                   being used to look up the kernel.
//   cntx          - context queried for blocksizes and the sup kernel.
//   rntm, thread  - runtime object and root thrinfo_t node used to grow the
//                   thrinfo tree and to compute per-thread ranges.
//
// Loop nest, outermost first:
//   JC: over m in steps of NC (range split among threads via thread_jc).
//   PC: over k in steps of KC; A is packed here when packa is set.
//   IC: over n in steps of MC (range split via thread_ic); B is packed here
//       when packb is set.
//   JR: over the current NC block of m in steps of MR (split via thread_jr);
//       each iteration invokes the millikernel once for the whole mc_cur
//       extent of n.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, the product contributes nothing, so scale C
by beta and return. Only the thread for which bli_thread_am_ochief()
holds performs the scaling (presumably so that C is scaled only once). */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
} \
return; \
} \
\
/* This transposition of the stor3_t id value is inherent to variant 1.
The reason: we assume that variant 2 is the "main" variant. The
consequence of this is that we assume that the millikernels that
iterate over m are registered to the "primary" kernel group associated
with the kernel IO preference; similarly, mkernels that iterate over
n are assumed to be registered to the "non-primary" group associated
with the ("non-primary") anti-preference. (Note that this pattern holds
regardless of whether the mkernel set has a row or column preference.)
See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \
stor_id = bli_stor3_trans( stor_id ); \
\
/* Query the context for various blocksizes. The NC, MC and KC values are
only starting points (hence the 0 suffix); the values actually used are
derived from them below. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Select the KC blocksize actually used. It starts from the context's
default (KC0) and is reduced for some combinations of packing and
storage; when neither matrix is packed it also shrinks as m and n grow
relative to MR and NR. Assuming dim_t is an integer type (it is used
with % below), (( KC0 / d ) / r ) * r is KC0/d rounded down to a
multiple of r.
NOTE(review): in the no-packing case the first branch is disabled via
FALSE; the var2m variant tests BLIS_RRR/BLIS_CCC at that position. */ \
dim_t KC; \
if ( packa && packb ) \
{ \
KC = KC0; \
} \
else if ( packb ) \
{ \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( stor_id == BLIS_RCR || \
stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
else KC = KC0; \
} \
else if ( packa ) \
{ \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( stor_id == BLIS_RCR || \
stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
else KC = KC0; \
} \
else /* if ( !packa && !packb ) */ \
{ \
if ( FALSE ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
} \
\
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. This is
needed here because, in this variant, NC blocks the m dimension (which
is subdivided by MR) and MC blocks the n dimension.
NOTE: This is unique to variant 1 (i.e., not performed in variant 2),
where MC % MR == 0 and NC % NR == 0 are already enforced at runtime. */ \
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
/* Query the maximum blocksize for MR, which implies a maximum blocksize
extension (MRE) for the final iteration of the JR loop. */ \
const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
const dim_t MRE = MRM - MR; \
\
/* Compute partitioning step values for each matrix of each loop. In this
variant the JC and JR loops advance by the row stride of C (and JC by
the row stride of A), the IC loop advances by the column strides of C
and B, and the PC loop advances along the k dimension of A and B. */ \
const inc_t jcstep_c = rs_c; \
const inc_t jcstep_a = rs_a; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = cs_c; \
const inc_t icstep_b = cs_b; \
\
const inc_t jrstep_c = rs_c * MR; \
\
/*
const inc_t jrstep_a = rs_a * MR; \
\
const inc_t irstep_c = cs_c * NR; \
const inc_t irstep_b = cs_b * NR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. Note that stor_id has already been transposed
above. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
/* Typed views of the untyped buffer and scalar arguments. */ \
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of beta and one scalars to prevent any unnecessary
sharing of cache lines between the cores' caches. */ \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
\
/* Auxiliary info passed to the millikernel; only the panel stride of B
is set in it (inside the IC loop). */ \
auxinfo_t aux; \
\
/* Parse and interpret the contents of the rntm_t object to properly
set the ways of parallelism for each loop. (Currently disabled here.) */ \
/*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree.
NOTE: These bszid_t values, and their order, match that of the bp
algorithm (variant 2) because they are not used to query actual
blocksizes but rather query the ways of parallelism for the various
loops. For example, the 2nd loop in variant 1 partitions in the m
dimension (in increments of MR), but parallelizes that m dimension
with BLIS_JR_NT. The only difference is that the _packa and _packb
arrays have been adjusted for the semantic difference in order in
which packa and packb nodes are encountered in the thrinfo tree.
That is, this panel-block algorithm partitions an NC x KC submatrix
of A to be packed in the 4th loop, and a KC x MC submatrix of B
to be packed in the 3rd loop. */ \
/* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t* restrict bszids; \
\
/* Set the bszids pointer to the correct bszids array above based on which
matrices (if any) are being packed. */ \
if ( packa ) { if ( packb ) bszids = bszids_packab; \
else bszids = bszids_packa; } \
else { if ( packb ) bszids = bszids_packb; \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. This is used
below to disable the extended-edge-case optimization of the JR loop. */ \
const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
\
/* One thrinfo_t pointer per loop/packing level; they are filled in as
the tree is grown on the way down the loop nest. */ \
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
\
/* Grow the thrinfo_t tree. The caller-provided node serves as the JC
node. */ \
bszid_t* restrict bszids_jc = bszids; \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. Note that the
JC loop of this variant partitions the m dimension, with MR passed as
the blocking factor. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
const dim_t m_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = m_local % NC; \
\
/* Loop over the m dimension (NC rows of C and A at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension: a full NC block
if at least NC rows remain, otherwise the leftover. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_pc = &bszids_jc[1]; \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. The k
dimension is not partitioned among threads here: every thread
covers the full range [0,k). */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop; later
iterations accumulate into C using a scalar of one. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing A. If we won't be packing A, we alias to
the _pc variables so that code further down can unconditionally
reference the _pa variables. Note that *if* we will be packing
A, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_sup_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pa; \
if ( packa ) { bszids_pa = &bszids_pc[1]; \
thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
else { bszids_pa = &bszids_pc[0]; \
thread_pa = thread_pc; } \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id.
NOTE: packing matrix A in this panel-block algorithm corresponds
to packing matrix B in the block-panel algorithm. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
stor_id, /* a "panel of B". */ \
BLIS_NO_TRANSPOSE, \
NC, KC, /* This "panel of B" is (at most) NC x KC. */ \
nc_cur, kc_cur, MR, \
&one_local, \
a_pc, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_pc_use = a_use; \
\
/* We don't need to embed the panel stride of A within the auxinfo_t
object because this variant iterates through A in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_ic = &bszids_pa[1]; \
thread_ic = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. Note that
the IC loop of this variant partitions the n dimension, with NR
passed as the blocking factor. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
const dim_t n_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = n_local % MC; \
\
/* Loop over the n dimension (MC columns of C and B at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict b_ic = b_pc + ii * icstep_b; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing B. If we won't be packing B, we alias to
the _ic variables so that code further down can unconditionally
reference the _pb variables. Note that *if* we will be packing
B, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_sup_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pb; \
if ( packb ) { bszids_pb = &bszids_ic[1]; \
thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
else { bszids_pb = &bszids_ic[0]; \
thread_pb = thread_ic; } \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then b_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id.
NOTE: packing matrix B in this panel-block algorithm corresponds
to packing matrix A in the block-panel algorithm. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
stor_id, /* a "block of A". */ \
BLIS_NO_TRANSPOSE, \
KC, MC, /* This "block of A" is (at most) KC x MC. */ \
kc_cur, mc_cur, NR, \
&one_local, \
b_ic, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_ic_use = b_use; \
\
/* Embed the panel stride of B within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of B. */ \
bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jr = &bszids_pb[1]; \
thread_jr = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop,
which subdivides the current nc_cur rows into panels of MR rows. */ \
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
dim_t jr_left = nc_cur % MR; \
\
/* An optimization: allow the last jr iteration to contain up to MRE
additional rows of C and A. (If MRE != 0, i.e. the maximum MR exceeds
the default MR, the mkernel has agreed to handle these cases.) Note
that this prevents us from declaring jr_iter and jr_left as const.
NOTE: We forgo this optimization when packing A since packing an
extended edge case is not yet supported. It is also skipped when
more than one thread is in use. */ \
if ( !packa && !is_mt ) \
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
{ \
jr_iter--; jr_left += MR; \
} \
\
/* Compute the JR loop thread range for the current thread. Here the
range is expressed in whole iterations rather than in rows. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
/* Number of rows handled by this iteration: MR, except for the
edge iteration, which handles jr_left rows. */ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
\
/*
ctype* restrict a_jr = a_pc + j * jrstep_a; \
*/ \
ctype* restrict a_jr = a_pc_use + j * ps_a_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
const dim_t ir_left = mc_cur % NR; \
*/ \
\
/* There is no explicit loop over the n dimension here: the whole
mc_cur extent is handed to the millikernel, which iterates
through the micropanels of B itself. */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
nr_cur, /* Notice: nr_cur <= MR (or <= MRM for an extended edge case). */ \
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
kc_cur, \
alpha_cast, \
a_jr, rs_a_use, cs_a_use, \
b_ic_use, rs_b_use, cs_b_use, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* NOTE: This barrier is only needed if we are packing A (since
that matrix is packed within the pc loop of this variant). */ \
if ( packa ) bli_thread_barrier( thread_pa ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTEMAC(ch,packm_sup_finalize_mem_a) \
( \
packa, \
rntm, \
&mem_a, \
thread_pa \
); \
PASTEMAC(ch,packm_sup_finalize_mem_b) \
( \
packb, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
//
// -- var2m --------------------------------------------------------------------
//
// Table of the type-specific gemmsup_ref_var2m implementations. The
// object-based front-end bli_gemmsup_ref_var2m() indexes it with the datatype
// of C to select the function to call.
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);
// Object-based front-end for the var2m gemmsup implementation.
//
// Unpacks the operand objects into raw buffers, strides and dimensions, picks
// the type-specific implementation from ftypes_var2m according to the datatype
// of C, and invokes it. If (after bli_gemmsup_ref_var1n2m_opt_cases() has had
// a chance to adjust trans and eff_id) the operation is marked as transposed,
// the call is issued with the roles of A and B exchanged and every row/column
// quantity swapped.
//
//   trans   - whether the whole operation is to be executed transposed.
//   alpha   - scalar applied to the product; read as a 1x1 buffer of C's type.
//   a, b    - input matrix objects (their trans/conj status is honored).
//   beta    - scalar applied to C; read as a 1x1 buffer of C's type.
//   c       - output matrix object; supplies m, n and the datatype.
//   eff_id  - storage combination id handed to the implementation.
//   cntx, rntm, thread - forwarded unchanged (rntm also supplies the
//             pack-A/pack-B requests).
void bli_gemmsup_ref_var2m
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     )
{
    // The datatype of C drives both the scalar lookups and the dispatch.
    const num_t  dt     = bli_obj_dt( c );

    // Packing requests come from the runtime object.
    const bool   pack_a = bli_rntm_pack_a( rntm );
    const bool   pack_b = bli_rntm_pack_b( rntm );

    const conj_t conj_a = bli_obj_conj_status( a );
    const conj_t conj_b = bli_obj_conj_status( b );

    // C is m x n.
    const dim_t  m      = bli_obj_length( c );
    const dim_t  n      = bli_obj_width( c );

    // Rather than aliasing A and B and inducing a transposition on the
    // aliases, a transposed operand is handled implicitly: its row and column
    // strides are read in swapped order (and, for A, k is taken from the
    // length instead of the width).
    const bool   a_notrans = bli_obj_has_notrans( a );
    const bool   b_notrans = bli_obj_has_notrans( b );

    const dim_t  k    = ( a_notrans ? bli_obj_width( a )      : bli_obj_length( a )     );
    const inc_t  a_rs = ( a_notrans ? bli_obj_row_stride( a ) : bli_obj_col_stride( a ) );
    const inc_t  a_cs = ( a_notrans ? bli_obj_col_stride( a ) : bli_obj_row_stride( a ) );

    const inc_t  b_rs = ( b_notrans ? bli_obj_row_stride( b ) : bli_obj_col_stride( b ) );
    const inc_t  b_cs = ( b_notrans ? bli_obj_col_stride( b ) : bli_obj_row_stride( b ) );

    const inc_t  c_rs = bli_obj_row_stride( c );
    const inc_t  c_cs = bli_obj_col_stride( c );

    // Raw buffers of the three matrices and of the two scalars.
    void* restrict a_buf     = bli_obj_buffer_at_off( a );
    void* restrict b_buf     = bli_obj_buffer_at_off( b );
    void* restrict c_buf     = bli_obj_buffer_at_off( c );
    void* restrict alpha_buf = bli_obj_buffer_for_1x1( dt, alpha );
    void* restrict beta_buf  = bli_obj_buffer_for_1x1( dt, beta );

    // Select the implementation that matches the datatype.
    FUNCPTR_T impl = ftypes_var2m[dt];

    // Give the optimizer a chance to map this storage/packing case onto
    // another one; it communicates its decision by rewriting trans and/or
    // eff_id.
    bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, pack_a, pack_b, &eff_id, cntx );

    if ( !bli_is_notrans( trans ) )
    {
        // Transposed execution: exchange A and B (buffers, pack flags and
        // conj values), exchange m and n, swap the row/column strides of
        // every matrix, and transpose the storage id.
        impl
        (
          pack_b,
          pack_a,
          conj_b,
          conj_a,
          n,
          m,
          k,
          alpha_buf,
          b_buf, b_cs, b_rs,
          a_buf, a_cs, a_rs,
          beta_buf,
          c_buf, c_cs, c_rs,
          bli_stor3_trans( eff_id ),
          cntx,
          rntm,
          thread
        );
        return;
    }

    // Straightforward execution: pass everything through as gathered above.
    impl
    (
      pack_a,
      pack_b,
      conj_a,
      conj_b,
      m,
      n,
      k,
      alpha_buf,
      a_buf, a_rs, a_cs,
      b_buf, b_rs, b_cs,
      beta_buf,
      c_buf, c_rs, c_cs,
      eff_id,
      cntx,
      rntm,
      thread
    );
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
if ( bli_thread_am_ochief( thread ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
} \
return; \
} \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
dim_t KC; \
if ( packa && packb ) \
{ \
KC = KC0; \
} \
else if ( packb ) \
{ \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( stor_id == BLIS_RCR || \
stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
else KC = KC0; \
} \
else if ( packa ) \
{ \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( stor_id == BLIS_RCR || \
stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
else KC = KC0; \
} \
else /* if ( !packa && !packb ) */ \
{ \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
} \
\
/* Query the maximum blocksize for NR, which implies a maximum blocksize
extension for the final iteration. */ \
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
const dim_t NRE = NRM - NR; \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c; \
const inc_t jcstep_b = cs_b; \
\
const inc_t pcstep_a = cs_a; \
const inc_t pcstep_b = rs_b; \
\
const inc_t icstep_c = rs_c; \
const inc_t icstep_a = rs_a; \
\
const inc_t jrstep_c = cs_c * NR; \
\
/*
const inc_t jrstep_b = cs_b * NR; \
( void )jrstep_b; \
\
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
/* Make local copies of beta and one scalars to prevent any unnecessary
sharing of cache lines between the cores' caches. */ \
ctype beta_local = *beta_cast; \
ctype one_local = *PASTEMAC(ch,1); \
\
auxinfo_t aux; \
\
/* Parse and interpret the contents of the rntm_t object to properly
set the ways of parallelism for each loop. */ \
/*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \
\
/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
needed for the matrix we will be packing (if any), but we do it
unconditionally to be safe. An alternative way of initializing the
mem_t entries is:
bli_mem_clear( &mem_a ); \
bli_mem_clear( &mem_b ); \
*/ \
mem_t mem_a = BLIS_MEM_INITIALIZER; \
mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
/* Define an array of bszid_t ids, which will act as our substitute for
the cntl_t tree. */ \
/* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \
bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
bszid_t* restrict bszids; \
\
/* Set the bszids pointer to the correct bszids array above based on which
matrices (if any) are being packed. */ \
if ( packa ) { if ( packb ) bszids = bszids_packab; \
else bszids = bszids_packa; } \
else { if ( packb ) bszids = bszids_packb; \
else bszids = bszids_nopack; } \
\
/* Determine whether we are using more than one thread. */ \
const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
\
thrinfo_t* restrict thread_jc = NULL; \
thrinfo_t* restrict thread_pc = NULL; \
thrinfo_t* restrict thread_pb = NULL; \
thrinfo_t* restrict thread_ic = NULL; \
thrinfo_t* restrict thread_pa = NULL; \
thrinfo_t* restrict thread_jr = NULL; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jc = bszids; \
thread_jc = thread; \
bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
/* Compute the JC loop thread range for the current thread. */ \
dim_t jc_start, jc_end; \
bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \
const dim_t n_local = jc_end - jc_start; \
\
/* Compute number of primary and leftover components of the JC loop. */ \
/*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \
const dim_t jc_left = n_local % NC; \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
{ \
/* Calculate the thread's current JC block dimension. */ \
const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_pc = &bszids_jc[1]; \
thread_pc = bli_thrinfo_sub_node( thread_jc ); \
bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
/* Compute the PC loop thread range for the current thread. */ \
const dim_t pc_start = 0, pc_end = k; \
const dim_t k_local = k; \
\
/* Compute number of primary and leftover components of the PC loop. */ \
/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
const dim_t pc_left = k_local % KC; \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
{ \
/* Calculate the thread's current PC block dimension. */ \
const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
ctype* b_use; \
inc_t rs_b_use, cs_b_use, ps_b_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing B. If we won't be packing B, we alias to
the _pc variables so that code further down can unconditionally
reference the _pb variables. Note that *if* we will be packing
B, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pb; \
if ( packb ) { bszids_pb = &bszids_pc[1]; \
thread_pb = bli_thrinfo_sub_node( thread_pc ); } \
else { bszids_pb = &bszids_pc[0]; \
thread_pb = thread_pc; } \
\
/* Determine the packing buffer and related parameters for matrix
B. (If B will not be packed, then a_use will be set to point to
b and the _b_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_b) \
( \
packb, \
BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \
stor_id, /* a "panel of B." */ \
BLIS_NO_TRANSPOSE, \
KC, NC, /* This "panel of B" is (at most) KC x NC. */ \
kc_cur, nc_cur, NR, \
&one_local, \
b_pc, rs_b, cs_b, \
&b_use, &rs_b_use, &cs_b_use, \
&ps_b_use, \
cntx, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/* Alias b_use so that it's clear this is our current block of
matrix B. */ \
ctype* restrict b_pc_use = b_use; \
\
/* We don't need to embed the panel stride of B within the auxinfo_t
object because this variant iterates through B in the jr loop,
which occurs here, within the macrokernel, not within the
millikernel. */ \
/*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_ic = &bszids_pb[1]; \
thread_ic = bli_thrinfo_sub_node( thread_pb ); \
bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
/* Compute the IC loop thread range for the current thread. */ \
dim_t ic_start, ic_end; \
bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \
const dim_t m_local = ic_end - ic_start; \
\
/* Compute number of primary and leftover components of the IC loop. */ \
/*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \
const dim_t ic_left = m_local % MC; \
\
/* Loop over the m dimension (MC rows at a time). */ \
/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
{ \
/* Calculate the thread's current IC block dimension. */ \
const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
ctype* a_use; \
inc_t rs_a_use, cs_a_use, ps_a_use; \
\
/* Set the bszid_t array and thrinfo_t pointer based on whether
we will be packing A. If we won't be packing A, we alias to
the _ic variables so that code further down can unconditionally
reference the _pa variables. Note that *if* we will be packing
A, the thrinfo_t node will have already been created by a
previous call to bli_thrinfo_grow(), since bszid values of
BLIS_NO_PART cause the tree to grow by two (e.g. to the next
bszid that is a normal bszid_t value). */ \
bszid_t* restrict bszids_pa; \
if ( packa ) { bszids_pa = &bszids_ic[1]; \
thread_pa = bli_thrinfo_sub_node( thread_ic ); } \
else { bszids_pa = &bszids_ic[0]; \
thread_pa = thread_ic; } \
\
/* Determine the packing buffer and related parameters for matrix
A. (If A will not be packed, then a_use will be set to point to
a and the _a_use strides will be set accordingly.) Then call
the packm sup variant chooser, which will call the appropriate
implementation based on the schema deduced from the stor_id. */ \
PASTEMAC(ch,packm_sup_a) \
( \
packa, \
BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \
stor_id, /* a "block of A." */ \
BLIS_NO_TRANSPOSE, \
MC, KC, /* This "block of A" is (at most) MC x KC. */ \
mc_cur, kc_cur, MR, \
&one_local, \
a_ic, rs_a, cs_a, \
&a_use, &rs_a_use, &cs_a_use, \
&ps_a_use, \
cntx, \
rntm, \
&mem_a, \
thread_pa \
); \
\
/* Alias a_use so that it's clear this is our current block of
matrix A. */ \
ctype* restrict a_ic_use = a_use; \
\
/* Embed the panel stride of A within the auxinfo_t object. The
millikernel will query and use this to iterate through
micropanels of A (if needed). */ \
bli_auxinfo_set_ps_a( ps_a_use, &aux ); \
\
/* Grow the thrinfo_t tree. */ \
bszid_t* restrict bszids_jr = &bszids_pa[1]; \
thread_jr = bli_thrinfo_sub_node( thread_pa ); \
bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
/* Compute number of primary and leftover components of the JR loop. */ \
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* An optimization: allow the last jr iteration to contain up to NRE
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. NOTE: We forgo this optimization when packing B
since packing an extended edge case is not yet supported. */ \
if ( !packb && !is_mt ) \
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
{ \
jr_iter--; jr_left += NR; \
} \
\
/* Compute the JR loop thread range for the current thread. */ \
dim_t jr_start, jr_end; \
bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
/*
ctype* restrict b_jr = b_pc_use + j * jrstep_b; \
*/ \
ctype* restrict b_jr = b_pc_use + j * ps_b_use; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
*/ \
\
/* Loop over the m dimension (MR rows at a time). */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mc_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ic_use, rs_a_use, cs_a_use, \
b_jr, rs_b_use, cs_b_use, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* NOTE: This barrier is only needed if we are packing B (since
that matrix is packed within the pc loop of this variant). */ \
if ( packb ) bli_thread_barrier( thread_pb ); \
} \
} \
\
/* Release any memory that was acquired for packing matrices A and B. */ \
PASTEMAC(ch,packm_sup_finalize_mem_a) \
( \
packa, \
rntm, \
&mem_a, \
thread_pa \
); \
PASTEMAC(ch,packm_sup_finalize_mem_b) \
( \
packb, \
rntm, \
&mem_b, \
thread_pb \
); \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_sup_vars.h 0000664 0000000 0000000 00000014214 14634250137 0022324 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares the object-based function PASTEMAC0(opname):
// a void function taking a trans_t, the obj_t operands alpha, a, b, beta
// and c, a stor3_t storage id (eff_id), and cntx_t, rntm_t and thrinfo_t
// pointers. It is instantiated below for the four gemmsup reference
// variants.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
trans_t trans, \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
stor3_t eff_id, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
GENPROT( gemmsup_ref_var1 )
GENPROT( gemmsup_ref_var2 )
GENPROT( gemmsup_ref_var1n )
GENPROT( gemmsup_ref_var2m )
//
// Prototype BLAS-like interfaces with void pointer operands.
//
// GENTPROT( ctype, ch, varname ) declares the type-specific function
// PASTEMAC(ch,varname). Operands are passed as restrict-qualified void
// pointers, each matrix with a row stride (rs_*) and column stride (cs_*).
// This form has no packa/packb flags and is instantiated for
// gemmsup_ref_var1 and gemmsup_ref_var2.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 )
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 )
// GENTPROT( ctype, ch, varname ) is redefined here to declare the
// type-specific function PASTEMAC(ch,varname) with two leading bool
// parameters, packa and packb, ahead of the conj/dimension/operand
// arguments. This form is instantiated for gemmsup_ref_var1n and
// gemmsup_ref_var2m.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
bool packa, \
bool packb, \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t eff_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
thrinfo_t* restrict thread \
);
INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n )
INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m )
// -----------------------------------------------------------------------------
// Rewrites *eff_id, and for some cases toggles *trans, for selected
// combinations of the incoming storage id and the packa/packb flags.
//
//   dt      datatype used to query the kernel's storage preference.
//   trans   in/out: toggled via bli_trans_toggled() in the cases below.
//   packa   whether matrix A will be packed.
//   packb   whether matrix B will be packed.
//   eff_id  in/out: the storage id, possibly replaced by another one.
//   cntx    context queried for the kernel's row preference.
//
// Only row-preferential kernels are supported. For any other kernel the
// function prints a message and calls bli_abort(). Combinations not listed
// below leave *trans and *eff_id untouched.
BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
     (
       num_t    dt,
       trans_t* trans,
       bool     packa,
       bool     packb,
       stor3_t* eff_id,
       cntx_t*  cntx
     )
{
	// The storage id as passed in. Every decision below is keyed on it.
	const stor3_t id_in = *eff_id;

	// Column-preferential kernels are not handled yet.
	if ( !bli_cntx_l3_sup_ker_prefers_rows_dt( dt, id_in, cntx ) )
	{
		//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
		printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
		bli_abort();
		return;
	}

	if ( packa && packb )
	{
		// Both A and B are packed.
		switch ( id_in )
		{
			case BLIS_RRC:
				// C is already row-stored, so the BLIS_RRR kernel can be used.
				*eff_id = BLIS_RRR;
				break;

			case BLIS_CRC:
				// Report BLIS_CCC, which the original authors note becomes
				// BLIS_RRR once the transposition is applied later.
				*eff_id = BLIS_CCC;
				break;

			case BLIS_CRR:
				// Induce a transpose to make C row-stored; BLIS_CCC becomes
				// BLIS_RRR once the transposition is applied later.
				*trans  = bli_trans_toggled( *trans );
				*eff_id = BLIS_CCC;
				break;

			default:
				break;
		}
	}
	else if ( packb )
	{
		// Only B is packed.
		switch ( id_in )
		{
			case BLIS_RRC:
				// C is already row-stored, so the BLIS_RRR kernel can be used.
				*eff_id = BLIS_RRR;
				break;

			case BLIS_CRC:
				// No transformation is beneficial here.
				break;

			case BLIS_RCC:
				// C is already row-stored; cancel the transposition and use
				// the BLIS_RCR kernel instead.
				*trans  = bli_trans_toggled( *trans );
				*eff_id = BLIS_RCR;
				break;

#if 0
			// Disabled: this transformation performs poorly. Theory from the
			// original authors: packing A (formerly B) when eff_id == BLIS_RCC
			// (formerly BLIS_CRR) to row storage is slow and kills the
			// performance?
			case BLIS_CRR:
				*trans  = bli_trans_toggled( *trans );
				*eff_id = BLIS_CRC; // BLIS_RRC when transposed below.
				break;
#endif

			default:
				break;
		}
	}
	else if ( packa )
	{
		// Only A is packed.
		if ( id_in == BLIS_CRR )
		{
			// Induce a transpose to make C row-stored; BLIS_CCR becomes
			// BLIS_RCR once the transposition is applied later.
			*trans  = bli_trans_toggled( *trans );
			*eff_id = BLIS_CCR;
		}
	}
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_tapi.c 0000664 0000000 0000000 00000020170 14634250137 0021410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2021, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces with typed operands (basic).
//
// Basic typed gemm interface, PASTEMAC(ch,opname), instantiated for gemm.
// It forwards every argument unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) and passes NULL for the two
// trailing arguments, which per the in-body comment requests the default
// cntx_t and rntm_t objects.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
transa, \
transb, \
m, n, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( gemm )
// Basic typed gemmt interface, PASTEMAC(ch,opname), instantiated for
// gemmt. It takes a uplo_t for C and the dimensions m and k (there is no
// n argument), forwards everything unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), and passes NULL for the trailing
// cntx_t and rntm_t arguments to request the defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uploc, \
transa, \
transb, \
m, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( gemmt )
// Basic typed hemm/symm interfaces, PASTEMAC(ch,opname). The macro accepts
// a fourth argument, struca (BLIS_HERMITIAN for hemm, BLIS_SYMMETRIC for
// symm), but this wrapper does not reference it: it only forwards its
// arguments to the expert interface PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF),
// passing NULL for the trailing cntx_t and rntm_t to request the defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \
\
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
conj_t conja, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
uploa, \
conja, \
transb, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN )
INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
// Basic typed herk interface, PASTEMAC(ch,opname), instantiated for herk.
// Both scalars, alpha and beta, are pointers to ctype_r rather than ctype.
// All arguments are forwarded unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), with NULL passed for the trailing
// cntx_t and rntm_t to request the defaults.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uploc, \
transa, \
m, k, \
alpha, \
a, rs_a, cs_a, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNCR_BASIC0( herk )
// Basic typed her2k interface, PASTEMAC(ch,opname), instantiated for
// her2k. alpha is a pointer to ctype while beta is a pointer to ctype_r.
// All arguments are forwarded unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), with NULL passed for the trailing
// cntx_t and rntm_t to request the defaults.
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uploc, \
transa, \
transb, \
m, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNCR_BASIC0( her2k )
// Basic typed syrk interface, PASTEMAC(ch,opname), instantiated for syrk.
// All arguments are forwarded unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), with NULL passed for the trailing
// cntx_t and rntm_t to request the defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uploc, \
transa, \
m, k, \
alpha, \
a, rs_a, cs_a, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( syrk )
// Basic typed syr2k interface, PASTEMAC(ch,opname), instantiated for
// syr2k. All arguments are forwarded unchanged to the expert interface
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), with NULL passed for the trailing
// cntx_t and rntm_t to request the defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uploc, \
transa, \
transb, \
m, k, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( syr2k )
// Basic typed trmm3 interface, PASTEMAC(ch,opname), instantiated for
// trmm3. Unlike the trmm/trsm wrapper in this file, it also takes transb,
// beta and a separate output matrix c. All arguments are forwarded
// unchanged to the expert interface PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF),
// with NULL passed for the trailing cntx_t and rntm_t to request the
// defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
uploa, \
transa, \
diaga, \
transb, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
beta, \
c, rs_c, cs_c, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( trmm3 )
// Basic typed trmm and trsm interfaces, PASTEMAC(ch,opname). Both share
// one signature with only two matrix operands (a and b) and a single
// scalar (alpha). All arguments are forwarded unchanged to the expert
// interface PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), with NULL passed for
// the trailing cntx_t and rntm_t to request the defaults.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b \
) \
{ \
/* Invoke the expert interface and request default cntx_t and rntm_t
objects. */ \
PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side, \
uploa, \
transa, \
diaga, \
m, n, \
alpha, \
a, rs_a, cs_a, \
b, rs_b, cs_b, \
NULL, \
NULL \
); \
}
INSERT_GENTFUNC_BASIC0( trmm )
INSERT_GENTFUNC_BASIC0( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_tapi.h 0000664 0000000 0000000 00000012431 14634250137 0021416 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands (basic).
//
// Exported (BLIS_EXPORT_BLIS) prototype of the basic typed gemm interface
// PASTEMAC(ch,opname). Each matrix operand is passed as a ctype pointer
// with a row stride (rs_*) and column stride (cs_*); alpha and beta are
// passed by pointer.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC0( gemm )
// Exported prototypes of the basic typed hemm and symm interfaces, which
// share one signature: a side_t, the uplo_t and conj_t of matrix a, the
// trans_t of matrix b, dimensions m and n, and the alpha/a/b/beta/c
// operands with their strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
conj_t conja, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )
// Exported prototype of the basic typed herk interface. Both scalars,
// alpha and beta, are pointers to ctype_r, while the matrices a and c are
// pointers to ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROTR_BASIC0( herk )
// Exported prototype of the basic typed her2k interface. alpha is a
// pointer to ctype, whereas beta is a pointer to ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROTR_BASIC0( her2k )
// Exported prototype of the basic typed syrk interface: a uplo_t for c, a
// trans_t for a, dimensions m and k, and the alpha/a/beta/c operands with
// their strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC0( syrk )
// Exported prototypes of the basic typed gemmt and syr2k interfaces, which
// share one signature: a uplo_t for c, trans_t values for a and b,
// dimensions m and k, and the alpha/a/b/beta/c operands with their
// strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
// Exported prototype of the basic typed trmm3 interface: side, the
// uplo/trans/diag of matrix a, the trans_t of matrix b, dimensions m and
// n, and the alpha/a/b/beta/c operands with their strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c \
);
INSERT_GENTPROT_BASIC0( trmm3 )
// Exported prototypes of the basic typed trmm and trsm interfaces, which
// share one signature: side, the uplo/trans/diag of matrix a, dimensions
// m and n, a single scalar alpha, and the two matrix operands a and b
// with their strides.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b \
);
INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_tapi_ex.c 0000664 0000000 0000000 00000035412 14634250137 0022111 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-like interfaces with typed operands (expert).
//
// Expert typed gemm interface. The raw scalar and matrix buffers are
// wrapped in obj_t views and handed, together with the caller's cntx and
// rntm, to the object-based expert interface
// PASTEMAC(opname,BLIS_OAPI_EX_SUF).
//
// The dimensions of A are derived from (m, k) and transa, those of B from
// (k, n) and transb; C is m x n.
//
// NOTE(review): the function name is pasted with BLIS_OAPI_EX_SUF; this
// presumably expands to the same suffix as BLIS_TAPI_EX_SUF -- confirm.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Scalars: wrap alpha and beta as 1x1 objects. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix B: dimensions from (k, n) and transb. */ \
	dim_t rows_b, cols_b; \
	obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transb, k, n, &rows_b, &cols_b ); \
	bli_obj_init_finish( dt, rows_b, cols_b, b, rs_b, cs_b, &b_obj ); \
	bli_obj_set_conjtrans( transb, &b_obj ); \
\
	/* Matrix C: m x n. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &b_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( gemm )
// Expert typed hemm/symm interfaces. The raw buffers are wrapped in obj_t
// views and passed, with side, cntx and rntm, to the object-based expert
// interface PASTEMAC(opname,BLIS_OAPI_EX_SUF).
//
// A is square; its order is chosen from m and n by
// bli_set_dim_with_side(). A receives the caller's uplo and conj values
// and the structure given by the macro argument struca (BLIS_HERMITIAN
// for hemm, BLIS_SYMMETRIC for symm). The dimensions of B come from
// (m, n) and transb; C is m x n.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Scalars: wrap alpha and beta as 1x1 objects. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
	/* Matrix A: square, with its order selected by side. */ \
	dim_t order_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dim_with_side( side, m, n, &order_a ); \
	bli_obj_init_finish( dt, order_a, order_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_uplo( uploa, &a_obj ); \
	bli_obj_set_conj( conja, &a_obj ); \
	bli_obj_set_struc( struca, &a_obj ); \
\
	/* Matrix B: dimensions from (m, n) and transb. */ \
	dim_t rows_b, cols_b; \
	obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transb, m, n, &rows_b, &cols_b ); \
	bli_obj_init_finish( dt, rows_b, cols_b, b, rs_b, cs_b, &b_obj ); \
	bli_obj_set_conjtrans( transb, &b_obj ); \
\
	/* Matrix C: m x n. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( side, &alpha_obj, &a_obj, &b_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN )
INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )
// Expert typed herk interface. alpha and beta are wrapped as 1x1 objects
// of the datatype PASTEMAC(chr,type), while A and C use
// PASTEMAC(ch,type). The dimensions of A come from (m, k) and transa; C
// is m x m, carries the caller's uplo, and is marked BLIS_HERMITIAN.
// Everything is then passed, with cntx and rntm, to the object-based
// expert interface PASTEMAC(opname,BLIS_OAPI_EX_SUF).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt_r = PASTEMAC(chr,type); \
	const num_t dt   = PASTEMAC(ch,type); \
\
	/* Scalars: both alpha and beta use the dt_r datatype. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt_r, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt_r, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix C: m x m, Hermitian, stored per uploc. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &c_obj ); \
	bli_obj_set_uplo( uploc, &c_obj ); \
	bli_obj_set_struc( BLIS_HERMITIAN, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNCR_BASIC0( herk )
// Expert typed her2k interface. alpha is wrapped with the datatype
// PASTEMAC(ch,type) and beta with PASTEMAC(chr,type). The dimensions of A
// and B both come from (m, k) and their respective trans values; C is
// m x m, carries the caller's uplo, and is marked BLIS_HERMITIAN.
// Everything is then passed, with cntx and rntm, to the object-based
// expert interface PASTEMAC(opname,BLIS_OAPI_EX_SUF).
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt_r = PASTEMAC(chr,type); \
	const num_t dt   = PASTEMAC(ch,type); \
\
	/* Scalars: alpha uses dt, beta uses dt_r. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt,   alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt_r, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix B: dimensions from (m, k) and transb. */ \
	dim_t rows_b, cols_b; \
	obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transb, m, k, &rows_b, &cols_b ); \
	bli_obj_init_finish( dt, rows_b, cols_b, b, rs_b, cs_b, &b_obj ); \
	bli_obj_set_conjtrans( transb, &b_obj ); \
\
	/* Matrix C: m x m, Hermitian, stored per uploc. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &c_obj ); \
	bli_obj_set_uplo( uploc, &c_obj ); \
	bli_obj_set_struc( BLIS_HERMITIAN, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &b_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNCR_BASIC0( her2k )
// Expert typed syrk interface. The dimensions of A come from (m, k) and
// transa; C is m x m, carries the caller's uplo, and is marked
// BLIS_SYMMETRIC. The wrapped operands are passed, with cntx and rntm, to
// the object-based expert interface PASTEMAC(opname,BLIS_OAPI_EX_SUF).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Scalars: wrap alpha and beta as 1x1 objects. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix C: m x m, symmetric, stored per uploc. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &c_obj ); \
	bli_obj_set_uplo( uploc, &c_obj ); \
	bli_obj_set_struc( BLIS_SYMMETRIC, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( syrk )
// Expert typed syr2k interface. The dimensions of A and B both come from
// (m, k) and their respective trans values; C is m x m, carries the
// caller's uplo, and is marked BLIS_SYMMETRIC. The wrapped operands are
// passed, with cntx and rntm, to the object-based expert interface
// PASTEMAC(opname,BLIS_OAPI_EX_SUF).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Scalars: wrap alpha and beta as 1x1 objects. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix B: dimensions from (m, k) and transb. */ \
	dim_t rows_b, cols_b; \
	obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transb, m, k, &rows_b, &cols_b ); \
	bli_obj_init_finish( dt, rows_b, cols_b, b, rs_b, cs_b, &b_obj ); \
	bli_obj_set_conjtrans( transb, &b_obj ); \
\
	/* Matrix C: m x m, symmetric, stored per uploc. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &c_obj ); \
	bli_obj_set_uplo( uploc, &c_obj ); \
	bli_obj_set_struc( BLIS_SYMMETRIC, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &b_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( syr2k )
// Expert typed gemmt interface. The dimensions of A come from (m, k) and
// transa, and those of B from (k, m) and transb. C is m x m and carries
// the caller's uplo; no structure is set on C in this wrapper. The
// wrapped operands are passed, with cntx and rntm, to the object-based
// expert interface PASTEMAC(opname,BLIS_OAPI_EX_SUF).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	bli_init_once(); \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Scalars: wrap alpha and beta as 1x1 objects. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
	/* Matrix A: dimensions from (m, k) and transa. */ \
	dim_t rows_a, cols_a; \
	obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transa, m, k, &rows_a, &cols_a ); \
	bli_obj_init_finish( dt, rows_a, cols_a, a, rs_a, cs_a, &a_obj ); \
	bli_obj_set_conjtrans( transa, &a_obj ); \
\
	/* Matrix B: dimensions from (k, m) and transb. */ \
	dim_t rows_b, cols_b; \
	obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
	bli_set_dims_with_trans( transb, k, m, &rows_b, &cols_b ); \
	bli_obj_init_finish( dt, rows_b, cols_b, b, rs_b, cs_b, &b_obj ); \
	bli_obj_set_conjtrans( transb, &b_obj ); \
\
	/* Matrix C: m x m, stored per uploc. */ \
	obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
	bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &c_obj ); \
	bli_obj_set_uplo( uploc, &c_obj ); \
\
	PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
	( &alpha_obj, &a_obj, &b_obj, &beta_obj, &c_obj, cntx, rntm ); \
}
INSERT_GENTFUNC_BASIC0( gemmt )
// trmm3 (typed, expert): wraps the caller's raw scalars and matrices in obj_t
// descriptors and forwards them, together with side, cntx and rntm, to the
// object-based expert interface of the same operation name. Matrix a is
// described as a square triangular object whose order is selected from
// (m, n) according to side.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    bli_init_once(); \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Order of the square matrix a, and the dimensions describing b. */ \
    dim_t order_a; \
    dim_t rows_b, cols_b; \
\
    bli_set_dim_with_side( side, m, n, &order_a ); \
    bli_set_dims_with_trans( transb, m, n, &rows_b, &cols_b ); \
\
    /* Scalar operands. */ \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
\
    bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
    bli_obj_init_finish_1x1( dt, beta,  &beta_obj  ); \
\
    /* Matrix operands; c is described as m x n. */ \
    obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
    obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
    obj_t c_obj = BLIS_OBJECT_INITIALIZER; \
\
    bli_obj_init_finish( dt, order_a, order_a, a, rs_a, cs_a, &a_obj ); \
    bli_obj_init_finish( dt, rows_b,  cols_b,  b, rs_b, cs_b, &b_obj ); \
    bli_obj_init_finish( dt, m,       n,       c, rs_c, cs_c, &c_obj ); \
\
    /* Properties of a: stored triangle, diagonal type, trans/conj, and */ \
    /* triangular structure. */ \
    bli_obj_set_uplo( uploa, &a_obj ); \
    bli_obj_set_diag( diaga, &a_obj ); \
    bli_obj_set_conjtrans( transa, &a_obj ); \
    bli_obj_set_struc( BLIS_TRIANGULAR, &a_obj ); \
\
    /* Transposition/conjugation requested for b. */ \
    bli_obj_set_conjtrans( transb, &b_obj ); \
\
    PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
    ( \
      side, \
      &alpha_obj, \
      &a_obj, \
      &b_obj, \
      &beta_obj, \
      &c_obj, \
      cntx, \
      rntm \
    ); \
}
INSERT_GENTFUNC_BASIC0( trmm3 )
// trmm / trsm (typed, expert): wraps the caller's raw scalar and matrices in
// obj_t descriptors and forwards them, together with side, cntx and rntm, to
// the object-based expert interface of the same operation name. Matrix a is
// described as a square triangular object whose order is selected from
// (m, n) according to side; b is described as m x n.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    bli_init_once(); \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Order of the square matrix a. */ \
    dim_t order_a; \
\
    bli_set_dim_with_side( side, m, n, &order_a ); \
\
    /* Scalar operand. */ \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
\
    bli_obj_init_finish_1x1( dt, alpha, &alpha_obj ); \
\
    /* Matrix operands. */ \
    obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
    obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
\
    bli_obj_init_finish( dt, order_a, order_a, a, rs_a, cs_a, &a_obj ); \
    bli_obj_init_finish( dt, m,       n,       b, rs_b, cs_b, &b_obj ); \
\
    /* Properties of a: stored triangle, diagonal type, trans/conj, and */ \
    /* triangular structure. */ \
    bli_obj_set_uplo( uploa, &a_obj ); \
    bli_obj_set_diag( diaga, &a_obj ); \
    bli_obj_set_conjtrans( transa, &a_obj ); \
    bli_obj_set_struc( BLIS_TRIANGULAR, &a_obj ); \
\
    PASTEMAC(opname,BLIS_OAPI_EX_SUF) \
    ( \
      side, \
      &alpha_obj, \
      &a_obj, \
      &b_obj, \
      cntx, \
      rntm \
    ); \
}
INSERT_GENTFUNC_BASIC0( trmm )
INSERT_GENTFUNC_BASIC0( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_tapi_ex.h 0000664 0000000 0000000 00000013436 14634250137 0022120 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-like interfaces with typed operands (expert).
//
// gemm: prototype template for the typed expert interface. Takes a trans_t
// for each of a and b, the dimensions m, n and k, scalars alpha and beta,
// and three matrices each passed as a pointer plus row/column strides
// (rs_*, cs_*), followed by the cntx_t and rntm_t pointers.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( gemm )
// hemm / symm: prototype template for the typed expert interfaces. Both
// operations share one signature: side, uplo and conj parameters for a, a
// trans_t for b, the dimensions m and n, scalars alpha and beta, and three
// matrices each passed as a pointer plus row/column strides, followed by the
// cntx_t and rntm_t pointers.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
conj_t conja, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )
// herk: prototype template for the typed expert interface. Uses the
// GENTPROTR form, which additionally receives ctype_r/chr; here both alpha
// and beta are declared as ctype_r*, while a and c are ctype* matrices passed
// with row/column strides.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROTR_BASIC0( herk )
// her2k: prototype template for the typed expert interface. Uses the
// GENTPROTR form; alpha is declared as ctype* whereas beta is declared as
// ctype_r*. Matrices a, b and c are ctype* pointers passed with row/column
// strides.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROTR_BASIC0( her2k )
// syrk: prototype template for the typed expert interface. Takes the uplo of
// c, a trans_t for a, the dimensions m and k, scalars alpha and beta (both
// ctype*), and matrices a and c passed with row/column strides, followed by
// the cntx_t and rntm_t pointers.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( syrk )
// gemmt / syr2k: prototype template for the typed expert interfaces. Both
// operations share one signature: the uplo of c, a trans_t for each of a and
// b, the dimensions m and k, scalars alpha and beta, and three matrices each
// passed as a pointer plus row/column strides, followed by the cntx_t and
// rntm_t pointers.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
// trmm3: prototype template for the typed expert interface. Takes side,
// uplo, trans and diag parameters for a, a trans_t for b, the dimensions m
// and n, scalars alpha and beta, and three matrices each passed as a pointer
// plus row/column strides, followed by the cntx_t and rntm_t pointers.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( trmm3 )
// trmm / trsm: prototype template for the typed expert interfaces. Both
// operations share one signature: side, uplo, trans and diag parameters for
// a, the dimensions m and n, a single scalar alpha, and matrices a and b
// passed with row/column strides, followed by the cntx_t and rntm_t
// pointers. There is no beta or c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_thrinfo.c 0000664 0000000 0000000 00000054621 14634250137 0022134 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#include "assert.h"
// Level-3 entry point for single-threaded thrinfo_t initialization; simply
// delegates to bli_thrinfo_init_single().
void bli_l3_thrinfo_init_single( thrinfo_t* thread )
{
    bli_thrinfo_init_single( thread );
}
// Level-3 entry point for releasing a thrinfo_t; simply delegates to
// bli_thrinfo_free() with the same rntm and thread arguments.
void bli_l3_thrinfo_free( rntm_t* rntm, thrinfo_t* thread )
{
    bli_thrinfo_free( rntm, thread );
}
// Level-3 sup entry point for releasing a thrinfo_t. Currently identical to
// bli_l3_thrinfo_free(): it delegates to bli_thrinfo_free().
void bli_l3_sup_thrinfo_free( rntm_t* rntm, thrinfo_t* thread )
{
    bli_thrinfo_free( rntm, thread );
}
// -----------------------------------------------------------------------------
// Create the root thrinfo_t node for the calling thread and store it in
// *thread.
//
//   id       - this thread's id, used directly as its id within gl_comm.
//   gl_comm  - the global communicator; also supplies the total thread count.
//   rntm     - queried for the ways of parallelism of the root loop.
//   cntl     - root control tree node; supplies the blocksize id.
//   thread   - output: receives the newly created node.
void bli_l3_thrinfo_create_root
     (
       dim_t       id,
       thrcomm_t*  gl_comm,
       rntm_t*     rntm,
       cntl_t*     cntl,
       thrinfo_t** thread
     )
{
    // Total number of threads participating in the global communicator.
    const dim_t nt_total = bli_thrcomm_num_threads( gl_comm );

    // The blocksize id of the root control tree node identifies which
    // loop's ways of parallelism apply at the top of the tree.
    const bszid_t root_bszid = bli_cntl_bszid( cntl );
    const dim_t   root_ways  = bli_rntm_ways_for( root_bszid, rntm );

    // Threads are grouped into root_ways contiguous groups of
    // ( nt_total / root_ways ) threads; the group index is the work id.
    // NOTE(review): this divides by zero if root_ways exceeds nt_total --
    // presumably callers guarantee otherwise; confirm.
    const dim_t root_work_id = id / ( nt_total / root_ways );

    *thread = bli_thrinfo_create
    (
      rntm,
      gl_comm,
      id,
      root_ways,
      root_work_id,
      TRUE,
      root_bszid,
      NULL
    );
}
// -----------------------------------------------------------------------------
// Create the root thrinfo_t node for the calling thread in the sup code path
// and store it in *thread. Unlike bli_l3_thrinfo_create_root(), no control
// tree is consulted: the root blocksize id is fixed to BLIS_NC.
//
//   id       - this thread's id, used directly as its id within gl_comm.
//   gl_comm  - the global communicator; also supplies the total thread count.
//   rntm     - queried for the ways of parallelism associated with BLIS_NC.
//   thread   - output: receives the newly created node.
void bli_l3_sup_thrinfo_create_root
     (
       dim_t       id,
       thrcomm_t*  gl_comm,
       rntm_t*     rntm,
       thrinfo_t** thread
     )
{
    // Total number of threads participating in the global communicator.
    const dim_t nt_total = bli_thrcomm_num_threads( gl_comm );

    // Hard-coding BLIS_NC is a bit of a hack, but it works because both sup
    // algorithms (bp and pb) use the cache blocksizes down to the 3rd loop.
    // (See bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb()
    // for a concise enumeration of these bszid_t ids.)
    const bszid_t root_bszid = BLIS_NC;
    const dim_t   root_ways  = bli_rntm_ways_for( root_bszid, rntm );

    // Threads are grouped into root_ways contiguous groups of
    // ( nt_total / root_ways ) threads; the group index is the work id.
    const dim_t root_work_id = id / ( nt_total / root_ways );

    *thread = bli_thrinfo_create
    (
      rntm,
      gl_comm,
      id,
      root_ways,
      root_work_id,
      TRUE,
      root_bszid,
      NULL
    );
}
// -----------------------------------------------------------------------------
// Refresh an existing root thrinfo_t node after the rntm_t's ways of
// parallelism have changed: re-query the BLIS_NC ways from rntm, recompute
// the work id from the node's existing thread count and communicator id, and
// store both back into the node.
void bli_l3_sup_thrinfo_update_root( rntm_t* rntm, thrinfo_t* thread )
{
    // Values already recorded in the root node.
    const dim_t nt_total = bli_thread_num_threads( thread );
    const dim_t comm_id  = bli_thread_ocomm_id( thread );

    // Updated ways of parallelism, as currently held by the rntm_t.
    const dim_t new_ways = bli_rntm_ways_for( BLIS_NC, rntm );

    // Same grouping rule used when the root was created.
    const dim_t new_work_id = comm_id / ( nt_total / new_ways );

    bli_thrinfo_set_n_way( new_ways, thread );
    bli_thrinfo_set_work_id( new_work_id, thread );
}
// -----------------------------------------------------------------------------
// Print, to stdout, a table describing the thrinfo_t tree of every thread for
// a gemm-like operation. A header (thread counts and ways of parallelism per
// level, taken from the last thread's tree) is followed by one entry per
// thread listing its communicator id and work id at each level. Levels whose
// thrinfo_t node does not exist are printed as -1.
//
//   threads - array of per-thread root thrinfo_t pointers; threads[0] must be
//             non-NULL since it is used to query the number of threads.
void bli_l3_thrinfo_print_gemm_paths
     (
       thrinfo_t** threads
     )
{
    // In order to query the number of threads, we query the only thread we
    // know exists: thread 0.
    dim_t n_threads = bli_thread_num_threads( threads[0] );

    // For the purposes of printing the "header" information that is common
    // to the various instances of a thrinfo_t (ie: across all threads), we
    // choose the last thread in case the problem is so small that there is
    // only an "edge" case, which will always be assigned to the last thread
    // (at least for higher levels of partitioning).
    thrinfo_t* jc_info = threads[n_threads-1];
    thrinfo_t* pc_info = NULL;
    thrinfo_t* pb_info = NULL;
    thrinfo_t* ic_info = NULL;
    thrinfo_t* pa_info = NULL;
    thrinfo_t* jr_info = NULL;
    thrinfo_t* ir_info = NULL;

    // Initialize the n_ways and n_threads fields of each thrinfo_t "level"
    // to -1. More than likely, these will all be overwritten with meaningful
    // values, but in case some thrinfo_t trees are not fully built (see
    // next comment), these will be the placeholder values.
    dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1,
          pa_way = -1, jr_way = -1, ir_way = -1;

    dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1,
          pa_nt = -1, jr_nt = -1, ir_nt = -1;

    // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
    // may not fully build their thrinfo_t structures--specifically when the
    // dimension being parallelized is not large enough for each thread to have
    // even one unit of work (where a unit is usually a single micropanel's
    // width, MR or NR).

    if ( !jc_info ) goto print_header;

    jc_way = bli_thread_n_way( jc_info );
    jc_nt = bli_thread_num_threads( jc_info );
    pc_info = bli_thrinfo_sub_node( jc_info );

    if ( !pc_info ) goto print_header;

    pc_way = bli_thread_n_way( pc_info );
    pc_nt = bli_thread_num_threads( pc_info );
    pb_info = bli_thrinfo_sub_node( pc_info );

    if ( !pb_info ) goto print_header;

    pb_way = bli_thread_n_way( pb_info );
    pb_nt = bli_thread_num_threads( pb_info );
    ic_info = bli_thrinfo_sub_node( pb_info );

    if ( !ic_info ) goto print_header;

    ic_way = bli_thread_n_way( ic_info );
    ic_nt = bli_thread_num_threads( ic_info );
    pa_info = bli_thrinfo_sub_node( ic_info );

    if ( !pa_info ) goto print_header;

    pa_way = bli_thread_n_way( pa_info );
    pa_nt = bli_thread_num_threads( pa_info );
    jr_info = bli_thrinfo_sub_node( pa_info );

    if ( !jr_info ) goto print_header;

    jr_way = bli_thread_n_way( jr_info );
    jr_nt = bli_thread_num_threads( jr_info );
    ir_info = bli_thrinfo_sub_node( jr_info );

    if ( !ir_info ) goto print_header;

    ir_way = bli_thread_n_way( ir_info );
    ir_nt = bli_thread_num_threads( ir_info );

    print_header:

    // The values are cast to (signed) long so that they match the %ld
    // conversions and so that the -1 placeholders are passed as signed
    // values, consistent with the per-thread output below.
    printf( " jc kc pb ic pa jr ir\n" );
    printf( "xx_nt: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
    ( long )jc_nt,
    ( long )pc_nt,
    ( long )pb_nt,
    ( long )ic_nt,
    ( long )pa_nt,
    ( long )jr_nt,
    ( long )ir_nt );
    printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
    ( long )jc_way,
    ( long )pc_way,
    ( long )pb_way,
    ( long )ic_way,
    ( long )pa_way,
    ( long )jr_way,
    ( long )ir_way );
    printf( "============================================\n" );

    for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
    {
        jc_info = threads[gl_id];

        // Placeholders (-1) for any level this thread did not build.
        dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1,
              pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1;

        dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1,
              pa_work_id = -1, jr_work_id = -1, ir_work_id = -1;

        if ( !jc_info ) goto print_thrinfo;

        jc_comm_id = bli_thread_ocomm_id( jc_info );
        jc_work_id = bli_thread_work_id( jc_info );
        pc_info = bli_thrinfo_sub_node( jc_info );

        if ( !pc_info ) goto print_thrinfo;

        pc_comm_id = bli_thread_ocomm_id( pc_info );
        pc_work_id = bli_thread_work_id( pc_info );
        pb_info = bli_thrinfo_sub_node( pc_info );

        if ( !pb_info ) goto print_thrinfo;

        pb_comm_id = bli_thread_ocomm_id( pb_info );
        pb_work_id = bli_thread_work_id( pb_info );
        ic_info = bli_thrinfo_sub_node( pb_info );

        if ( !ic_info ) goto print_thrinfo;

        ic_comm_id = bli_thread_ocomm_id( ic_info );
        ic_work_id = bli_thread_work_id( ic_info );
        pa_info = bli_thrinfo_sub_node( ic_info );

        if ( !pa_info ) goto print_thrinfo;

        pa_comm_id = bli_thread_ocomm_id( pa_info );
        pa_work_id = bli_thread_work_id( pa_info );
        jr_info = bli_thrinfo_sub_node( pa_info );

        if ( !jr_info ) goto print_thrinfo;

        jr_comm_id = bli_thread_ocomm_id( jr_info );
        jr_work_id = bli_thread_work_id( jr_info );
        ir_info = bli_thrinfo_sub_node( jr_info );

        if ( !ir_info ) goto print_thrinfo;

        ir_comm_id = bli_thread_ocomm_id( ir_info );
        ir_work_id = bli_thread_work_id( ir_info );

        print_thrinfo:

        printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
        ( long )jc_comm_id,
        ( long )pc_comm_id,
        ( long )pb_comm_id,
        ( long )ic_comm_id,
        ( long )pa_comm_id,
        ( long )jr_comm_id,
        ( long )ir_comm_id );
        printf( "work ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n",
        ( long )jc_work_id,
        ( long )pc_work_id,
        ( long )pb_work_id,
        ( long )ic_work_id,
        ( long )pa_work_id,
        ( long )jr_work_id,
        ( long )ir_work_id );
        printf( "--------------------------------------------\n" );
    }
}
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// -----------------------------------------------------------------------------
// Print, to stdout, a table describing the thrinfo_t tree of every thread for
// a trsm-like operation. Below the ic level each tree has two branches, a
// "prenode" branch (bli_thrinfo_sub_prenode()) and the regular sub-node
// branch; for the pa, jr and ir levels both are printed as "prenode|node".
// A header (thread counts and ways of parallelism per level, taken from the
// last thread's tree) is followed by one entry per thread listing its
// communicator id and work id at each level. Levels whose thrinfo_t node
// does not exist are printed as -1.
//
//   threads - array of per-thread root thrinfo_t pointers; threads[0] must be
//             non-NULL since it is used to query the number of threads.
void bli_l3_thrinfo_print_trsm_paths
     (
       thrinfo_t** threads
     )
{
    // In order to query the number of threads, we query the only thread we
    // know exists: thread 0.
    dim_t n_threads = bli_thread_num_threads( threads[0] );

    // For the purposes of printing the "header" information that is common
    // to the various instances of a thrinfo_t (ie: across all threads), we
    // choose the last thread in case the problem is so small that there is
    // only an "edge" case, which will always be assigned to the last thread
    // (at least for higher levels of partitioning).
    thrinfo_t* jc_info = threads[n_threads-1];
    thrinfo_t* pc_info = NULL;
    thrinfo_t* pb_info = NULL;
    thrinfo_t* ic_info = NULL;
    thrinfo_t* pa_info = NULL; thrinfo_t* pa_info0 = NULL;
    thrinfo_t* jr_info = NULL; thrinfo_t* jr_info0 = NULL;
    thrinfo_t* ir_info = NULL; thrinfo_t* ir_info0 = NULL;

    // Initialize the n_ways and n_threads fields of each thrinfo_t "level"
    // to -1. More than likely, these will all be overwritten with meaningful
    // values, but in case some thrinfo_t trees are not fully built (see
    // next comment), these will be the placeholder values.
    dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1,
          pa_way = -1, jr_way = -1, ir_way = -1,
          pa_way0 = -1, jr_way0 = -1, ir_way0 = -1;

    dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1,
          pa_nt = -1, jr_nt = -1, ir_nt = -1,
          pa_nt0 = -1, jr_nt0 = -1, ir_nt0 = -1;

    // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads
    // may not fully build their thrinfo_t structures--specifically when the
    // dimension being parallelized is not large enough for each thread to have
    // even one unit of work (where a unit is usually a single micropanel's
    // width, MR or NR).

    if ( !jc_info ) goto print_header;

    jc_way = bli_thread_n_way( jc_info );
    jc_nt = bli_thread_num_threads( jc_info );
    pc_info = bli_thrinfo_sub_node( jc_info );

    if ( !pc_info ) goto print_header;

    pc_way = bli_thread_n_way( pc_info );
    pc_nt = bli_thread_num_threads( pc_info );
    pb_info = bli_thrinfo_sub_node( pc_info );

    if ( !pb_info ) goto print_header;

    pb_way = bli_thread_n_way( pb_info );
    pb_nt = bli_thread_num_threads( pb_info );
    ic_info = bli_thrinfo_sub_node( pb_info );

    if ( !ic_info ) goto print_header;

    ic_way = bli_thread_n_way( ic_info );
    ic_nt = bli_thread_num_threads( ic_info );
    pa_info = bli_thrinfo_sub_node( ic_info );
    pa_info0 = bli_thrinfo_sub_prenode( ic_info );

    // Walk the prenode branch first; whenever it ends, continue with the
    // regular branch.
    if ( !pa_info0 ) goto check_header_node;

    pa_way0 = bli_thread_n_way( pa_info0 );
    pa_nt0 = bli_thread_num_threads( pa_info0 );
    jr_info0 = bli_thrinfo_sub_node( pa_info0 );

    if ( !jr_info0 ) goto check_header_node;

    jr_way0 = bli_thread_n_way( jr_info0 );
    jr_nt0 = bli_thread_num_threads( jr_info0 );
    ir_info0 = bli_thrinfo_sub_node( jr_info0 );

    if ( !ir_info0 ) goto check_header_node;

    ir_way0 = bli_thread_n_way( ir_info0 );
    ir_nt0 = bli_thread_num_threads( ir_info0 );

    check_header_node:

    if ( !pa_info ) goto print_header;

    pa_way = bli_thread_n_way( pa_info );
    pa_nt = bli_thread_num_threads( pa_info );
    jr_info = bli_thrinfo_sub_node( pa_info );

    if ( !jr_info ) goto print_header;

    jr_way = bli_thread_n_way( jr_info );
    jr_nt = bli_thread_num_threads( jr_info );
    ir_info = bli_thrinfo_sub_node( jr_info );

    if ( !ir_info ) goto print_header;

    ir_way = bli_thread_n_way( ir_info );
    ir_nt = bli_thread_num_threads( ir_info );

    print_header:

    printf( " jc kc pb ic pa jr ir\n" );
    printf( "xx_nt: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
    ( long )jc_nt,
    ( long )pc_nt,
    ( long )pb_nt,
    ( long )ic_nt,
    ( long )pa_nt0, ( long )pa_nt,
    ( long )jr_nt0, ( long )jr_nt,
    ( long )ir_nt0, ( long )ir_nt );
    printf( "xx_way: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
    ( long )jc_way,
    ( long )pc_way,
    ( long )pb_way,
    ( long )ic_way,
    ( long )pa_way0, ( long )pa_way,
    ( long )jr_way0, ( long )jr_way,
    ( long )ir_way0, ( long )ir_way );
    printf( "==================================================\n" );

    for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id )
    {
        jc_info = threads[gl_id];

        // NOTE: Every id starts out as the -1 placeholder so that this code
        // is safe to execute for small problems that are parallelized enough
        // that one or more threads gets no work (and therefore has an
        // incomplete thrinfo_t tree).
        dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1,
              pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1,
              pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1;

        dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1,
              pa_work_id = -1, jr_work_id = -1, ir_work_id = -1,
              pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1;

        if ( !jc_info ) goto print_thrinfo;

        jc_comm_id = bli_thread_ocomm_id( jc_info );
        jc_work_id = bli_thread_work_id( jc_info );
        pc_info = bli_thrinfo_sub_node( jc_info );

        if ( !pc_info ) goto print_thrinfo;

        pc_comm_id = bli_thread_ocomm_id( pc_info );
        pc_work_id = bli_thread_work_id( pc_info );
        pb_info = bli_thrinfo_sub_node( pc_info );

        if ( !pb_info ) goto print_thrinfo;

        pb_comm_id = bli_thread_ocomm_id( pb_info );
        pb_work_id = bli_thread_work_id( pb_info );
        ic_info = bli_thrinfo_sub_node( pb_info );

        if ( !ic_info ) goto print_thrinfo;

        ic_comm_id = bli_thread_ocomm_id( ic_info );
        ic_work_id = bli_thread_work_id( ic_info );
        pa_info = bli_thrinfo_sub_node( ic_info );
        pa_info0 = bli_thrinfo_sub_prenode( ic_info );

        // Walk the prenode branch first; whenever it ends, continue with
        // the regular branch.
        if ( !pa_info0 ) goto check_thrinfo_node;

        pa_comm_id0 = bli_thread_ocomm_id( pa_info0 );
        pa_work_id0 = bli_thread_work_id( pa_info0 );
        jr_info0 = bli_thrinfo_sub_node( pa_info0 );

        if ( !jr_info0 ) goto check_thrinfo_node;

        jr_comm_id0 = bli_thread_ocomm_id( jr_info0 );
        jr_work_id0 = bli_thread_work_id( jr_info0 );
        ir_info0 = bli_thrinfo_sub_node( jr_info0 );

        if ( !ir_info0 ) goto check_thrinfo_node;

        ir_comm_id0 = bli_thread_ocomm_id( ir_info0 );
        ir_work_id0 = bli_thread_work_id( ir_info0 );

        check_thrinfo_node:

        if ( !pa_info ) goto print_thrinfo;

        pa_comm_id = bli_thread_ocomm_id( pa_info );
        pa_work_id = bli_thread_work_id( pa_info );
        jr_info = bli_thrinfo_sub_node( pa_info );

        if ( !jr_info ) goto print_thrinfo;

        jr_comm_id = bli_thread_ocomm_id( jr_info );
        jr_work_id = bli_thread_work_id( jr_info );
        ir_info = bli_thrinfo_sub_node( jr_info );

        if ( !ir_info ) goto print_thrinfo;

        ir_comm_id = bli_thread_ocomm_id( ir_info );
        ir_work_id = bli_thread_work_id( ir_info );

        print_thrinfo:

        printf( "comm ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
        ( long )jc_comm_id,
        ( long )pc_comm_id,
        ( long )pb_comm_id,
        ( long )ic_comm_id,
        ( long )pa_comm_id0, ( long )pa_comm_id,
        ( long )jr_comm_id0, ( long )jr_comm_id,
        ( long )ir_comm_id0, ( long )ir_comm_id );
        printf( "work ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n",
        ( long )jc_work_id,
        ( long )pc_work_id,
        ( long )pb_work_id,
        ( long )ic_work_id,
        ( long )pa_work_id0, ( long )pa_work_id,
        ( long )jr_work_id0, ( long )jr_work_id,
        ( long )ir_work_id0, ( long )ir_work_id );
        printf( "--------------------------------------------------\n" );
    }
}
// -----------------------------------------------------------------------------
// Release every per-thread thrinfo_t tree in the threads array via
// bli_l3_thrinfo_free(), then release the array itself via bli_free_intl().
//
//   rntm    - passed through to bli_l3_thrinfo_free() for each thread.
//   threads - array of root thrinfo_t pointers; threads[0] must be non-NULL
//             since it supplies the number of entries.
void bli_l3_thrinfo_free_paths( rntm_t* rntm, thrinfo_t** threads )
{
    // Read the thread count up front, before thread 0's node is freed.
    const dim_t n_threads = bli_thread_num_threads( threads[0] );

    for ( dim_t tid = 0; tid < n_threads; ++tid )
    {
        bli_l3_thrinfo_free( rntm, threads[tid] );
    }

    bli_free_intl( threads );
}
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_thrinfo.h 0000664 0000000 0000000 00000010277 14634250137 0022140 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// thrinfo_t macros specific to various level-3 operations.
//
// gemm
// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Each macro yields the base address advanced by step * inc elements. The
// arguments are parenthesized so that compound expressions (e.g. a step of
// "i + 1") expand with the intended precedence.
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_gemm_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )
// gemmt
// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Each macro yields the base address advanced by step * inc elements. The
// arguments are parenthesized so that compound expressions (e.g. a step of
// "i + 1") expand with the intended precedence.
#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )
// trmm
// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to
// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR.
// Each upanel macro yields the base address advanced by step * inc elements.
// All macro arguments are parenthesized so that compound expressions expand
// with the intended precedence.
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( (a1) + (step) * (inc) )
#define bli_trmm_get_next_b_upanel( b1, step, inc ) ( (b1) + (step) * (inc) )
// Evaluates to nonzero when iteration "index" is congruent, modulo the
// thread's n_way, to the thread's work_id. NOTE: "thread" is evaluated more
// than once, so it must not have side effects.
#define bli_trmm_my_iter_rr( index, thread ) \
\
( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
// trsm
// Evaluate to nonzero when index and thread->work_id leave the same remainder
// modulo thread->n_way. Both arguments are parenthesized so that expressions
// such as "i + 1" or "&info" can be passed safely.
#define bli_trsm_my_iter_rr( index, thread ) \
\
    ( (index) % (thread)->n_way == (thread)->work_id % (thread)->n_way )
//
// thrinfo_t APIs specific to level-3 operations.
//
// Declarations only; the definitions are not part of this header.
//
// bli_l3_thrinfo_init: takes a thrinfo_t to operate on together with a
// communicator, an id within that communicator, a way count, a work id, and
// a child node.
// NOTE(review): presumably these values are stored into `thread` -- confirm
// against the definition.
void bli_l3_thrinfo_init
(
thrinfo_t* thread,
thrcomm_t* ocomm,
dim_t ocomm_id,
dim_t n_way,
dim_t work_id,
thrinfo_t* sub_node
);
// bli_l3_thrinfo_init_single: variant that takes only the thrinfo_t.
// NOTE(review): presumably sets up a single-threaded node -- confirm.
void bli_l3_thrinfo_init_single
(
thrinfo_t* thread
);
// bli_l3_thrinfo_free: takes the runtime object and a thrinfo_t.
// NOTE(review): presumably releases `thread` (and possibly its sub-nodes)
// using resources reachable from `rntm` -- confirm.
void bli_l3_thrinfo_free
(
rntm_t* rntm,
thrinfo_t* thread
);
// bli_l3_sup_thrinfo_free: same parameter list as bli_l3_thrinfo_free.
// NOTE(review): the "sup" infix presumably refers to the sup code path --
// confirm how it differs from bli_l3_thrinfo_free.
void bli_l3_sup_thrinfo_free
(
rntm_t* rntm,
thrinfo_t* thread
);
// -----------------------------------------------------------------------------
// Declarations only; the definitions are not part of this header.
//
// bli_l3_thrinfo_create_root: takes a thread id, a communicator, the runtime
// object, a control tree node, and an output location (thrinfo_t**).
// NOTE(review): presumably writes a newly created root node to *thread --
// confirm ownership rules against the definition.
void bli_l3_thrinfo_create_root
(
dim_t id,
thrcomm_t* gl_comm,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t** thread
);
// bli_l3_sup_thrinfo_create_root: same as above but without a cntl_t
// parameter.
void bli_l3_sup_thrinfo_create_root
(
dim_t id,
thrcomm_t* gl_comm,
rntm_t* rntm,
thrinfo_t** thread
);
// bli_l3_sup_thrinfo_update_root: takes the runtime object and an existing
// thrinfo_t.
// NOTE(review): presumably refreshes the root node from `rntm` -- confirm.
void bli_l3_sup_thrinfo_update_root
(
rntm_t* rntm,
thrinfo_t* thread
);
// Printing helpers; each takes an array of thrinfo_t pointers.
// NOTE(review): the required array length is not visible here -- confirm.
void bli_l3_thrinfo_print_gemm_paths
(
thrinfo_t** threads
);
void bli_l3_thrinfo_print_trsm_paths
(
thrinfo_t** threads
);
// -----------------------------------------------------------------------------
// Declaration only; the definition is not part of this header. Takes the
// runtime object and an array of thrinfo_t pointers.
// NOTE(review): presumably frees every path in `threads` and the array
// itself -- confirm against the definition.
void bli_l3_thrinfo_free_paths
(
rntm_t* rntm,
thrinfo_t** threads
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr.h 0000664 0000000 0000000 00000004456 14634250137 0021272 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-3 micro-kernels.
//
// Note: Instead of defining function prototype macro templates and then
// instantiating those macros to define the individual function prototypes,
// we simply alias the official operations' prototypes as defined in
// bli_l3_ukr_prot.h.
// gemm: point GENTPROT at the gemm prototype template and instantiate it for
// whatever name gemm_ukr_name expands to. The *_ukr_name macros are not
// defined in this header, so an including file must define them first.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// invokes GENTPROT once per basic datatype -- confirm.
#undef GENTPROT
#define GENTPROT GEMM_UKR_PROT
INSERT_GENTPROT_BASIC0( gemm_ukr_name )
// gemmtrsm: the lower and upper variants share one prototype template.
#undef GENTPROT
#define GENTPROT GEMMTRSM_UKR_PROT
INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )
// trsm: the lower and upper variants share one prototype template.
#undef GENTPROT
#define GENTPROT TRSM_UKR_PROT
INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_fpa.c 0000664 0000000 0000000 00000004173 14634250137 0022107 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define function pointer query interfaces.
//
// For each ( tname, opname ) pair, GENFRONT emits two things:
//  1. a GENARRAY_FPA() invocation for the function-pointer type built from
//     tname (PASTECH2(tname,_ukr,_vft)) and the operation name opname; and
//  2. a query function, named via PASTEMAC(opname,_qfp), that returns the
//     element at index dt of the array named via PASTECH(opname,_fpa).
// The query function uses dt directly as an array index, with no range check.
// NOTE(review): GENARRAY_FPA is defined elsewhere; presumably it defines the
// <opname>_fpa array indexed by datatype -- confirm.
#undef GENFRONT
#define GENFRONT( tname, opname ) \
\
GENARRAY_FPA( PASTECH2(tname,_ukr,_vft), \
opname ); \
\
PASTECH2(tname,_ukr,_vft) \
PASTEMAC(opname,_qfp)( num_t dt ) \
{ \
return PASTECH(opname,_fpa)[ dt ]; \
}
// Instantiate the array and query function for each level-3 micro-kernel.
// The lower and upper gemmtrsm/trsm variants reuse the same pointer type.
GENFRONT( gemm, gemm_ukernel )
GENFRONT( gemmtrsm, gemmtrsm_l_ukernel )
GENFRONT( gemmtrsm, gemmtrsm_u_ukernel )
GENFRONT( trsm, trsm_l_ukernel )
GENFRONT( trsm, trsm_u_ukernel )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_fpa.h 0000664 0000000 0000000 00000003757 14634250137 0022123 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype function pointer query interface.
//
// For each ( tname, opname ) pair, GENPROT declares the query function named
// via PASTEMAC(opname,_qfp). It takes a datatype (num_t dt) and returns a
// function pointer of the type built from tname (PASTECH2(tname,_ukr,_vft)).
#undef GENPROT
#define GENPROT( tname, opname ) \
\
PASTECH2(tname,_ukr,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );
// One declaration per level-3 micro-kernel wrapper.
GENPROT( gemm, gemm_ukernel )
GENPROT( gemmtrsm, gemmtrsm_l_ukernel )
GENPROT( gemmtrsm, gemmtrsm_u_ukernel )
GENPROT( trsm, trsm_l_ukernel )
GENPROT( trsm, trsm_u_ukernel )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_oapi.c 0000664 0000000 0000000 00000015536 14634250137 0022276 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENFRONT
#define GENFRONT( tname, opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx \
) \
{ \
bli_init_once(); \
\
num_t dt = bli_obj_dt( c ); \
\
dim_t m = bli_obj_length( c ); \
dim_t n = bli_obj_width( c ); \
dim_t k = bli_obj_width( a ); \
void* buf_a = bli_obj_buffer_at_off( a ); \
void* buf_b = bli_obj_buffer_at_off( b ); \
void* buf_c = bli_obj_buffer_at_off( c ); \
inc_t rs_c = bli_obj_row_stride( c ); \
inc_t cs_c = bli_obj_col_stride( c ); \
void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \
\
auxinfo_t data; \
\
/* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \
bli_auxinfo_set_next_a( buf_a, &data ); \
bli_auxinfo_set_next_b( buf_b, &data ); \
bli_auxinfo_set_is_a( 1, &data ); \
bli_auxinfo_set_is_b( 1, &data ); \
\
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(tname,_ukr,_vft) f = \
PASTEMAC(opname,_qfp)( dt ); \
\
f \
( \
m, \
n, \
k, \
buf_alpha, \
buf_a, \
buf_b, \
buf_beta, \
buf_c, rs_c, cs_c, \
&data, \
cntx \
); \
} \
GENFRONT( gemm, gemm_ukernel )
#undef GENFRONT
#define GENFRONT( tname, opname, opnamel, opnameu ) \
\
void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a1x, \
obj_t* a11, \
obj_t* bx1, \
obj_t* b11, \
obj_t* c11, \
cntx_t* cntx \
) \
{ \
bli_init_once(); \
\
num_t dt = bli_obj_dt( c11 ); \
\
dim_t m = bli_obj_length( c11 ); \
dim_t n = bli_obj_width( c11 ); \
dim_t k = bli_obj_width( a1x ); \
void* buf_a1x = bli_obj_buffer_at_off( a1x ); \
void* buf_a11 = bli_obj_buffer_at_off( a11 ); \
void* buf_bx1 = bli_obj_buffer_at_off( bx1 ); \
void* buf_b11 = bli_obj_buffer_at_off( b11 ); \
void* buf_c11 = bli_obj_buffer_at_off( c11 ); \
inc_t rs_c = bli_obj_row_stride( c11 ); \
inc_t cs_c = bli_obj_col_stride( c11 ); \
void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
auxinfo_t data; \
\
/* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \
if ( bli_obj_is_lower( a11 ) ) \
{ bli_auxinfo_set_next_a( buf_a1x, &data ); } \
else /* if ( bli_obj_is_upper( a11 ) ) */ \
{ bli_auxinfo_set_next_a( buf_a11, &data ); } \
bli_auxinfo_set_next_b( buf_bx1, &data ); \
\
/* Invoke the void pointer-based function for the given datatype. */ \
if ( bli_obj_is_lower( a11 ) ) \
{ \
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(tname,_ukr,_vft) f = \
PASTEMAC(opnamel,_qfp)( dt ); \
\
f \
( \
m, \
n, \
k, \
buf_alpha, \
buf_a1x, \
buf_a11, \
buf_bx1, \
buf_b11, \
buf_c11, rs_c, cs_c, \
&data, \
cntx \
); \
} \
else /* if ( bli_obj_is_upper( a11 ) ) */ \
{ \
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(tname,_ukr,_vft) f = \
PASTEMAC(opnameu,_qfp)( dt ); \
\
f \
( \
m, \
n, \
k, \
buf_alpha, \
buf_a1x, \
buf_a11, \
buf_bx1, \
buf_b11, \
buf_c11, rs_c, cs_c, \
&data, \
cntx \
); \
} \
} \
GENFRONT( gemmtrsm, gemmtrsm_ukernel, gemmtrsm_l_ukernel, gemmtrsm_u_ukernel )
#undef GENFRONT
#define GENFRONT( tname, opname, opnamel, opnameu ) \
\
void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx \
) \
{ \
bli_init_once(); \
\
num_t dt = bli_obj_dt( c ); \
\
void* buf_a = bli_obj_buffer_at_off( a ); \
void* buf_b = bli_obj_buffer_at_off( b ); \
void* buf_c = bli_obj_buffer_at_off( c ); \
inc_t rs_c = bli_obj_row_stride( c ); \
inc_t cs_c = bli_obj_col_stride( c ); \
\
auxinfo_t data; \
\
/* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \
bli_auxinfo_set_next_a( buf_a, &data ); \
bli_auxinfo_set_next_b( buf_b, &data ); \
bli_auxinfo_set_is_a( 1, &data ); \
bli_auxinfo_set_is_b( 1, &data ); \
\
/* Invoke the void pointer-based function for the given datatype. */ \
if ( bli_obj_is_lower( a ) ) \
{ \
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(tname,_ukr,_vft) f = \
PASTEMAC(opnamel,_qfp)( dt ); \
\
f \
( \
buf_a, \
buf_b, \
buf_c, rs_c, cs_c, \
&data, \
cntx \
); \
} \
else /* if ( bli_obj_is_upper( a ) ) */ \
{ \
/* Query a type-specific function pointer, except one that uses
void* for function arguments instead of typed pointers. */ \
PASTECH2(tname,_ukr,_vft) f = \
PASTEMAC(opnameu,_qfp)( dt ); \
\
f \
( \
buf_a, \
buf_b, \
buf_c, rs_c, cs_c, \
&data, \
cntx \
); \
} \
} \
GENFRONT( trsm, trsm_ukernel, trsm_l_ukernel, trsm_u_ukernel )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_oapi.h 0000664 0000000 0000000 00000004670 14634250137 0022300 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// Declares the exported (BLIS_EXPORT_BLIS) object-based entry point whose
// name is built by PASTEMAC0(opname). It takes the scalars alpha and beta,
// the operands a, b, and c, and a context; it returns nothing.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a, \
obj_t* b, \
obj_t* beta, \
obj_t* c, \
cntx_t* cntx \
);
GENPROT( gemm_ukernel )
// Declares the exported (BLIS_EXPORT_BLIS) object-based entry point whose
// name is built by PASTEMAC0(opname). It takes the scalar alpha, the operand
// objects a1x, a11, bx1, b11, and c11, and a context; it returns nothing.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* alpha, \
obj_t* a1x, \
obj_t* a11, \
obj_t* bx1, \
obj_t* b11, \
obj_t* c11, \
cntx_t* cntx \
);
GENPROT( gemmtrsm_ukernel )
// Declares the exported (BLIS_EXPORT_BLIS) object-based entry point whose
// name is built by PASTEMAC0(opname). It takes the operand objects a, b, and
// c, and a context; it returns nothing.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx \
);
GENPROT( trsm_ukernel )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_prot.h 0000664 0000000 0000000 00000005766 14634250137 0022343 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Define template prototypes for level-3 micro-kernels.
//
// GEMM_UKR_PROT is the single-type form: it forwards to GEMM_UKR_PROT2 using
// ctype for both the input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)
// GEMM_UKR_PROT2 declares a void function named PASTEMAC(ch,opname). The
// operands a and b use ctype_in, while alpha, beta, and c use ctype_out.
// The remaining parameters are the dimensions m, n, k, the row and column
// strides of c, an auxinfo_t pointer, and a context pointer.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype_out* restrict alpha, \
ctype_in* restrict a, \
ctype_in* restrict b, \
ctype_out* restrict beta, \
ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
// GEMMTRSM_UKR_PROT declares a void function named PASTEMAC(ch,opname). All
// operand pointers (alpha, a1x, a11, bx1, b11, c11) use ctype. The remaining
// parameters are the dimensions m, n, k, the row and column strides of c11,
// an auxinfo_t pointer, and a context pointer.
#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a1x, \
ctype* restrict a11, \
ctype* restrict bx1, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
// TRSM_UKR_PROT declares a void function named PASTEMAC(ch,opname). The
// operand pointers a, b, and c use ctype. The remaining parameters are the
// row and column strides of c, an auxinfo_t pointer, and a context pointer.
// Unlike the gemm and gemmtrsm templates, no m/n/k dimensions are passed.
#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
);
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_tapi.c 0000664 0000000 0000000 00000010763 14634250137 0022300 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, tname, kerid ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
bli_init_once(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the function address of the current
datatype's micro-kernel. */ \
PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the typed function for the given datatype. */ \
f \
( \
m, \
n, \
k, \
alpha, \
a, \
b, \
beta, \
c, rs_c, cs_c, \
data, \
cntx \
); \
} \
INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, tname, kerid ) \
\
void PASTEMAC(ch,opname) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a1x, \
ctype* restrict a11, \
ctype* restrict bx1, \
ctype* restrict b11, \
ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
bli_init_once(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the function address of the current
datatype's micro-kernel. */ \
PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the typed function for the given datatype. */ \
f \
( \
m, \
n, \
k, \
alpha, \
a1x, \
a11, \
bx1, \
b11, \
c11, rs_c, cs_c, \
data, \
cntx \
); \
} \
INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR )
INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ukernel, gemmtrsm, BLIS_GEMMTRSM_U_UKR )
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, tname, kerid ) \
\
void PASTEMAC(ch,opname) \
( \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
bli_init_once(); \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for the function address of the current
datatype's micro-kernel. */ \
PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \
\
/* Invoke the typed function for the given datatype. */ \
f \
( \
a, \
b, \
c, rs_c, cs_c, \
data, \
cntx \
); \
} \
INSERT_GENTFUNC_BASIC2( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR )
INSERT_GENTFUNC_BASIC2( trsm_u_ukernel, trsm, BLIS_TRSM_U_UKR )
cython-blis-1.0.0/blis/_src/frame/3/bli_l3_ukr_tapi.h 0000664 0000000 0000000 00000004165 14634250137 0022304 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Generate prototypes for level-3 micro-kernel wrappers.
//
// Bind each generic *_ukr_name placeholder to the corresponding wrapper base
// name. The template header included after these definitions uses the
// placeholders when instantiating its prototypes, so these bindings determine
// which function names get declared. Each placeholder is #undef'd first so
// that an earlier binding is replaced without a redefinition.
#undef gemm_ukr_name
#define gemm_ukr_name gemm_ukernel
#undef gemmtrsm_l_ukr_name
#define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel
#undef gemmtrsm_u_ukr_name
#define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel
#undef trsm_l_ukr_name
#define trsm_l_ukr_name trsm_l_ukernel
#undef trsm_u_ukr_name
#define trsm_u_ukr_name trsm_u_ukernel
// Include the level-3 micro-kernel API template.
#include "bli_l3_ukr.h"
cython-blis-1.0.0/blis/_src/frame/3/gemm/ 0000775 0000000 0000000 00000000000 14634250137 0020010 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm.h 0000664 0000000 0000000 00000003521 14634250137 0021735 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_gemm_cntl.h"
#include "bli_gemm_front.h"
#include "bli_gemm_var.h"
#include "bli_gemm_ind_opt.h"
// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
#include "bli_gemm_md.h"
#endif
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_blk_var1.c 0000664 0000000 0000000 00000006003 14634250137 0023507 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Blocked gemm variant 1: walks the m dimension in blocks, pairing each block
// of A with the matching block of C and handing the resulting subproblem
// (with all of B) to bli_l3_int() along with the child control-tree and
// thrinfo nodes.
//
void bli_gemm_blk_var1
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Views onto the current block of A and of C.
    obj_t a_blk, c_blk;

    // Half-open range [range_lo, range_hi) assigned to the calling thread.
    dim_t range_lo, range_hi;

    // Forwards or backwards traversal, as dictated by the operation.
    dir_t dir = bli_l3_direct( a, b, c, cntl );

    // Drop any zero region lying along the m dimension before partitioning.
    bli_l3_prune_unref_mparts_m( a, b, c, cntl );

    // Ask for this thread's portion of the m dimension.
    bli_thread_range_mdim
    (
      dir, thread, a, b, c, cntl, cntx,
      &range_lo, &range_hi
    );

    // Step through the assigned range one algorithmic block at a time.
    dim_t off = range_lo;
    while ( off < range_hi )
    {
        // Size of the block that starts at the current offset.
        dim_t blk = bli_determine_blocksize( dir, off, range_hi, a,
                                             bli_cntl_bszid( cntl ), cntx );

        // Carve the block out of A, then out of C.
        bli_acquire_mpart_mdim( dir, BLIS_SUBPART1,
                                off, blk, a, &a_blk );
        bli_acquire_mpart_mdim( dir, BLIS_SUBPART1,
                                off, blk, c, &c_blk );

        // Solve the gemm subproblem on the acquired blocks.
        bli_l3_int
        (
          &BLIS_ONE,
          &a_blk,
          b,
          &BLIS_ONE,
          &c_blk,
          cntx,
          rntm,
          bli_cntl_sub_node( cntl ),
          bli_thrinfo_sub_node( thread )
        );

        off += blk;
    }
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_blk_var2.c 0000664 0000000 0000000 00000006003 14634250137 0023510 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Blocked gemm, variant 2: sweeps the n dimension.
//
// For each block of columns inside the calling thread's sub-range, a
// partition of B (b1) and the matching partition of C (c1) are acquired and
// the sub-problem is handed to bli_l3_int() together with the sub-nodes of
// the control tree and of the thread info structure. Matrix A is passed
// through unpartitioned.
//
//   a, b, c  operands of the current (sub)problem
//   cntx     context, used when determining the blocksize
//   rntm     runtime object, forwarded to the sub-problem
//   cntl     control tree node for this loop (supplies the blocksize id)
//   thread   thread info node for this loop
//
void bli_gemm_blk_var2( obj_t* a, obj_t* b, obj_t* c,
                        cntx_t* cntx, rntm_t* rntm,
                        cntl_t* cntl, thrinfo_t* thread )
{
    obj_t b1;
    obj_t c1;
    dim_t n_begin;
    dim_t n_end;

    // Forwards or backwards traversal of the partitioned dimension.
    dir_t direct = bli_l3_direct( a, b, c, cntl );

    // Drop any zero region lying along the n dimension before partitioning.
    bli_l3_prune_unref_mparts_n( a, b, c, cntl );

    // Ask the threading layer which [n_begin, n_end) range is ours.
    bli_thread_range_ndim( direct, thread, a, b, c, cntl, cntx,
                           &n_begin, &n_end );

    dim_t j = n_begin;

    while ( j < n_end )
    {
        // Algorithmic blocksize for the block starting at offset j.
        dim_t nb = bli_determine_blocksize( direct, j, n_end, b,
                                            bli_cntl_bszid( cntl ), cntx );

        // Carve out B1 and C1 for this block.
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, j, nb, b, &b1 );
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, j, nb, c, &c1 );

        // Execute the gemm sub-problem one level down in the control tree.
        bli_l3_int( &BLIS_ONE, a, &b1,
                    &BLIS_ONE, &c1,
                    cntx, rntm,
                    bli_cntl_sub_node( cntl ),
                    bli_thrinfo_sub_node( thread ) );

        j += nb;
    }
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_blk_var3.c 0000664 0000000 0000000 00000010442 14634250137 0023513 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Blocked gemm, variant 3: sweeps the k dimension.
//
// Every iteration acquires a partition of A along its n dimension (a1) and
// a partition of B along its m dimension (b1), then passes the resulting
// rank-k update of C to bli_l3_int(). All of k is traversed by every caller;
// no per-thread sub-range is computed here.
//
//   a, b, c  operands of the current (sub)problem
//   cntx     context, used when determining the k blocksize
//   rntm     runtime object, forwarded to the sub-problem
//   cntl     control tree node for this loop (blocksize id, family)
//   thread   thread info node for this loop
//
void bli_gemm_blk_var3( obj_t* a, obj_t* b, obj_t* c,
                        cntx_t* cntx, rntm_t* rntm,
                        cntl_t* cntl, thrinfo_t* thread )
{
    obj_t a1;
    obj_t b1;

    // Forwards or backwards traversal of the partitioned dimension.
    dir_t direct = bli_l3_direct( a, b, c, cntl );

    // Drop any zero region lying along the k dimension before partitioning.
    bli_l3_prune_unref_mparts_k( a, b, c, cntl );

    // Extent of the dimension being partitioned.
    dim_t k_trans = bli_obj_width_after_trans( a );

    dim_t p = 0;

    while ( p < k_trans )
    {
        // Algorithmic blocksize for the block starting at offset p.
        dim_t kb = bli_l3_determine_kc( direct, p, k_trans, a, b,
                                        bli_cntl_bszid( cntl ), cntx, cntl );

        // Carve out A1 and B1 for this block.
        bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, p, kb, a, &a1 );
        bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, p, kb, b, &b1 );

        // Execute the gemm sub-problem one level down in the control tree.
        bli_l3_int( &BLIS_ONE, &a1, &b1,
                    &BLIS_ONE, c,
                    cntx, rntm,
                    bli_cntl_sub_node( cntl ),
                    bli_thrinfo_sub_node( thread ) );

        bli_thread_barrier( bli_thrinfo_sub_node( thread ) );

        // This loop issues several rank-k updates into the same C. For the
        // gemm and gemmt families the scalar attached to C (beta) must be
        // applied by the first update only, so it is reset once that
        // update is done; c is a locally aliased obj_t (see the _int()
        // function), so overwriting its internal scalar is harmless.
        //
        // trmm is the exception. There, the beta handed to the macro-kernel
        // is applied only to the row-panel of C that corresponds to the
        // current block intersecting the diagonal. For trmm that beta must
        // be zero; for trmm3 it may be non-zero, but it is still applied to
        // each row-panel exactly once. In both cases the scalar has to stay
        // attached for every iteration, so no reset is performed.
        if ( bli_cntl_family( cntl ) != BLIS_TRMM && p == 0 )
        {
            bli_obj_scalar_reset( c );
        }

        p += kb;
    }
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_cntl.c 0000664 0000000 0000000 00000017270 14634250137 0022756 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Entry point for building a gemm control tree. All arguments are forwarded
// unchanged to bli_gemmbp_cntl_create(), whose result is returned.
//
cntl_t* bli_gemm_cntl_create( rntm_t* rntm,
                              opid_t  family,
                              pack_t  schema_a,
                              pack_t  schema_b,
                              void_fp ker )
{
    cntl_t* tree = bli_gemmbp_cntl_create( rntm, family,
                                           schema_a, schema_b, ker );

    return tree;
}
// -----------------------------------------------------------------------------
//
// Builds the block-panel gemm control tree and returns its root.
//
// The nodes are created leaf-first and chained through their sub-node
// argument, giving (root to leaf):
//
//   NC loop (bli_gemm_blk_var2)
//     -> KC loop (bli_gemm_blk_var3)
//       -> pack B
//         -> MC loop (bli_gemm_blk_var1)
//           -> pack A
//             -> macro-kernel node (BLIS_NR)
//               -> leaf node (BLIS_MR)
//
//   rntm      runtime object handed to every node-creation call
//   family    operation family; selects the default macro-kernel
//   schema_a  pack schema for A (normally BLIS_PACKED_ROW_PANELS)
//   schema_b  pack schema for B (normally BLIS_PACKED_COL_PANELS)
//   ker       optional macro-kernel override; NULL selects the default
//
cntl_t* bli_gemmbp_cntl_create( rntm_t* rntm,
                                opid_t  family,
                                pack_t  schema_a,
                                pack_t  schema_b,
                                void_fp ker )
{
    // A caller-supplied kernel always wins. Only when none was given do we
    // fall back to the default for the operation family.
    void_fp macro_kernel_fp = ker;

    if ( macro_kernel_fp == NULL )
    {
        if      ( family == BLIS_GEMM  ) macro_kernel_fp = bli_gemm_ker_var2;
        else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2;
        else if ( family == BLIS_TRMM  ) macro_kernel_fp = bli_trmm_xx_ker_var2;
        // Any other family should never occur; the pointer stays NULL.
    }

    // Leaf of the tree: no variant function and no sub-node. The MR
    // blocksize id is needed for bli_thrinfo_rgrow().
    cntl_t* node_leaf = bli_gemm_cntl_create_node
    (
        rntm,
        family,
        BLIS_MR,
        NULL,
        NULL
    );

    // Macro-kernel node. NR is not used by the macro-kernel itself but is
    // needed for bli_thrinfo_rgrow().
    cntl_t* node_macro = bli_gemm_cntl_create_node
    (
        rntm,
        family,
        BLIS_NR,
        macro_kernel_fp,
        node_leaf
    );

    // Packing of matrix A (the left-hand operand).
    cntl_t* node_packa = bli_packm_cntl_create_node
    (
        rntm,
        bli_l3_packa,
        BLIS_MR,
        BLIS_KR,
        FALSE,                   // do NOT invert diagonal
        FALSE,                   // reverse iteration if upper?
        FALSE,                   // reverse iteration if lower?
        schema_a,
        BLIS_BUFFER_FOR_A_BLOCK,
        node_macro
    );

    // Partitioning of the m dimension by MC.
    cntl_t* node_mc = bli_gemm_cntl_create_node
    (
        rntm,
        family,
        BLIS_MC,
        bli_gemm_blk_var1,
        node_packa
    );

    // Packing of matrix B (the right-hand operand).
    cntl_t* node_packb = bli_packm_cntl_create_node
    (
        rntm,
        bli_l3_packb,
        BLIS_NR,
        BLIS_KR,
        FALSE,                   // do NOT invert diagonal
        FALSE,                   // reverse iteration if upper?
        FALSE,                   // reverse iteration if lower?
        schema_b,
        BLIS_BUFFER_FOR_B_PANEL,
        node_mc
    );

    // Partitioning of the k dimension by KC.
    cntl_t* node_kc = bli_gemm_cntl_create_node
    (
        rntm,
        family,
        BLIS_KC,
        bli_gemm_blk_var3,
        node_packb
    );

    // Partitioning of the n dimension by NC; this is the root of the tree.
    cntl_t* node_nc = bli_gemm_cntl_create_node
    (
        rntm,
        family,
        BLIS_NC,
        bli_gemm_blk_var2,
        node_kc
    );

    return node_nc;
}
// -----------------------------------------------------------------------------
// This control tree creation function is disabled because it is no longer used.
// (It was originally created in the run up to publishing the 1m journal article,
// but was disabled to reduce complexity.)
#if 0
// Disabled (compiled out) panel-block counterpart of bli_gemmbp_cntl_create().
// It builds the same seven-node chain, but with the roles of the blocked
// variants swapped: bli_gemm_blk_var2 is attached to the MC node and
// bli_gemm_blk_var1 to the NC node, and the macro-kernel is
// bli_gemm_ker_var1.
//
// NOTE(review): this code is stale relative to the active functions in this
// file. Its calls to bli_gemm_cntl_create_node() and
// bli_packm_cntl_create_node() do not pass an rntm_t* first argument, the
// packm calls pass bli_gemm_packa/bli_gemm_packb plus bli_packm_blk_var1
// rather than bli_l3_packa/bli_l3_packb, and the function takes no rntm
// parameter. It would have to be brought up to date before being re-enabled.
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
)
{
void_fp macro_kernel_p = bli_gemm_ker_var1;
// Change the macro-kernel if the operation family is gemmt or trmm.
//if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2;
//else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2;
// Create two nodes for the macro-kernel.
cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node
(
family, // the operation family
BLIS_MR, // needed for bli_thrinfo_rgrow()
NULL, // variant function pointer not used
NULL // no sub-node; this is the leaf of the tree.
);
cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node
(
family,
BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow()
macro_kernel_p,
gemm_cntl_ub_ke
);
// Create a node for packing matrix A (which is really the right-hand
// operand "B").
cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node
(
bli_gemm_packb, // pack the right-hand operand
bli_packm_blk_var1,
BLIS_MR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_COL_PANELS,
BLIS_BUFFER_FOR_A_BLOCK,
gemm_cntl_pb_ub
);
// Create a node for partitioning the n dimension by MC.
cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node
(
family,
BLIS_MC,
bli_gemm_blk_var2,
gemm_cntl_packb
);
// Create a node for packing matrix B (which is really the left-hand
// operand "A").
cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node
(
bli_gemm_packa, // pack the left-hand operand
bli_packm_blk_var1,
BLIS_NR,
BLIS_KR,
FALSE, // do NOT invert diagonal
FALSE, // reverse iteration if upper?
FALSE, // reverse iteration if lower?
BLIS_PACKED_ROW_PANELS,
BLIS_BUFFER_FOR_B_PANEL,
gemm_cntl_op_pb
);
// Create a node for partitioning the k dimension by KC.
cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node
(
family,
BLIS_KC,
bli_gemm_blk_var3,
gemm_cntl_packa
);
// Create a node for partitioning the m dimension by NC.
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node
(
family,
BLIS_NC,
bli_gemm_blk_var1,
gemm_cntl_mm_op
);
// The NC node is the root of the tree.
return gemm_cntl_vl_mm;
}
#endif
// -----------------------------------------------------------------------------
//
// Releases a gemm control tree by delegating to bli_cntl_free() with the
// same runtime object, tree node and thread info.
//
void bli_gemm_cntl_free( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread )
{
    bli_cntl_free( rntm, cntl, thread );
}
// -----------------------------------------------------------------------------
//
// Creates one gemm control tree node via bli_cntl_create_node().
//
//   rntm      runtime object forwarded to the generic node constructor
//   family    operation family recorded in the node
//   bszid     blocksize id associated with the node
//   var_func  variant function for the node (may be NULL)
//   sub_node  child node, or NULL for a leaf
//
// The fifth argument of bli_cntl_create_node() is always passed as NULL
// here (presumably the node's params pointer -- confirm against
// bli_cntl_create_node()).
//
cntl_t* bli_gemm_cntl_create_node( rntm_t* rntm,
                                   opid_t  family,
                                   bszid_t bszid,
                                   void_fp var_func,
                                   cntl_t* sub_node )
{
    cntl_t* node = bli_cntl_create_node( rntm, family, bszid,
                                         var_func, NULL, sub_node );

    return node;
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_cntl.h 0000664 0000000 0000000 00000005140 14634250137 0022754 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Builds and returns a gemm control tree. The definition in bli_gemm_cntl.c
// forwards all arguments to bli_gemmbp_cntl_create().
//   rntm      runtime object used during node creation
//   family    operation family (selects the default macro-kernel)
//   schema_a  pack schema for A
//   schema_b  pack schema for B
//   ker       optional macro-kernel override; NULL selects the default
cntl_t* bli_gemm_cntl_create
(
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b,
void_fp ker
);
// -----------------------------------------------------------------------------
// Builds the block-panel gemm control tree and returns its root node (the
// NC-partitioning node). Parameters have the same meaning as for
// bli_gemm_cntl_create().
cntl_t* bli_gemmbp_cntl_create
(
rntm_t* rntm,
opid_t family,
pack_t schema_a,
pack_t schema_b,
void_fp ker
);
#if 0
// Prototype of the disabled panel-block control tree constructor. It is
// kept compiled out together with its definition in bli_gemm_cntl.c.
// The trailing comma that used to follow the sole parameter was removed:
// it is not valid C and would have broken the build as soon as this block
// was re-enabled.
cntl_t* bli_gemmpb_cntl_create
(
opid_t family
);
#endif
// -----------------------------------------------------------------------------
// Frees a gemm control tree. The definition in bli_gemm_cntl.c delegates to
// bli_cntl_free() with the same three arguments.
void bli_gemm_cntl_free
(
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// -----------------------------------------------------------------------------
// Creates a single gemm control tree node for the given operation family,
// blocksize id and variant function, with sub_node as its child (NULL for a
// leaf). The definition in bli_gemm_cntl.c wraps bli_cntl_create_node().
cntl_t* bli_gemm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,
cntl_t* sub_node
);
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_front.c 0000664 0000000 0000000 00000023152 14634250137 0023142 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Front-end for the gemm operation.
//
// In order, this function:
//   1. returns early if C has a zero dimension, or scales C by beta and
//      returns if alpha is zero or A or B has a zero dimension;
//   2. aliases A, B and C into local obj_t's and resets their origins;
//   3. transposes the whole operation (swapping A and B) when the
//      micro-kernel dislikes the storage of C;
//   4. sets the pack schemas and, if BLIS_ENABLE_GEMM_MD is defined, routes
//      mixed-datatype cases through bli_gemm_md();
//   5. attaches alpha to the local B and beta to the local C, then replaces
//      the alpha/beta pointers with &BLIS_ONE;
//   6. sets the ways of parallelism in rntm for a gemm of this size;
//   7. optionally (BLIS_ENABLE_GEMM_MD_EXTRA_MEM) sets up a temporary matrix
//      ct to accumulate into;
//   8. invokes bli_l3_int via bli_l3_thread_decorator(), and finally
//      accumulates ct back into C and frees it if it was used.
//
// Parameters:
//   alpha, beta  scalars of the operation
//   a, b, c      matrix operands; only local aliases are modified here
//   cntx         context (may be redirected to a local copy by bli_gemm_md())
//   rntm         runtime object; its ways of parallelism are set here
//   cntl         control tree passed through to the thread decorator
void bli_gemm_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
bli_init_once();
obj_t a_local;
obj_t b_local;
obj_t c_local;
// If C has a zero dimension, return early.
if ( bli_obj_has_zero_dim( c ) )
{
return;
}
// If alpha is zero, or if A or B has a zero dimension, scale C by beta
// and return early.
if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
bli_obj_has_zero_dim( a ) ||
bli_obj_has_zero_dim( b ) )
{
bli_scalm( beta, c );
return;
}
// NOTE: The small-matrix path below is compiled out by the enclosing
// "#if 0", regardless of whether BLIS_ENABLE_SMALL_MATRIX is defined.
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX
// Only handle small problems separately for homogeneous datatypes.
if ( bli_obj_dt( a ) == bli_obj_dt( b ) &&
bli_obj_dt( a ) == bli_obj_dt( c ) &&
bli_obj_comp_prec( c ) == bli_obj_prec( c ) )
{
err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
}
#endif
#endif
// Alias A, B, and C in case we need to apply transformations.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( c, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// An optimization: If C is stored by rows and the micro-kernel prefers
// contiguous columns, or if C is stored by columns and the micro-kernel
// prefers contiguous rows, transpose the entire operation to allow the
// micro-kernel to access elements of C in its preferred manner.
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
{
bli_obj_swap( &a_local, &b_local );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
#ifdef BLIS_ENABLE_GEMM_MD
cntx_t cntx_local;
// If any of the storage datatypes differ, or if the computation precision
// differs from the storage precision of C, utilize the mixed datatype
// code path.
// NOTE: If we ever want to support the caller setting the computation
// domain explicitly, we will need to check the computation dt against the
// storage dt of C (instead of the computation precision against the
// storage precision of C).
if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) ||
bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) ||
bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) )
{
// Handle mixed datatype cases in bli_gemm_md(), which may modify
// the objects or the context. (If the context is modified, cntx
// is adjusted to point to cntx_local.)
bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx );
}
#endif
// Next, we handle the possibility of needing to typecast alpha to the
// computation datatype and/or beta to the storage datatype of C.
// Attach alpha to B, and in the process typecast alpha to the target
// datatype of the matrix (which in this case is equal to the computation
// datatype).
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
// Attach beta to C, and in the process typecast beta to the target
// datatype of the matrix (which in this case is equal to the storage
// datatype of C).
bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local );
// Change the alpha and beta pointers to BLIS_ONE since the values have
// now been typecast and attached to the matrices above.
alpha = &BLIS_ONE;
beta = &BLIS_ONE;
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
bli_rntm_set_ways_for_op
(
BLIS_GEMM,
BLIS_LEFT, // ignored for gemm/hemm/symm
bli_obj_length( &c_local ),
bli_obj_width( &c_local ),
bli_obj_width( &a_local ),
rntm
);
// By default the back-end writes straight into the local alias of C using
// the (now unit) beta; the extra-memory path below may redirect both.
obj_t* cp = &c_local;
obj_t* betap = beta;
#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
// If any of the following conditions are met, create a temporary matrix
// conformal to C into which we will accumulate the matrix product:
// - the storage precision of C differs from the computation precision;
// - the domains are mixed as crr;
// - the storage format of C does not match the preferred orientation
// of the ccr or crc cases.
// Then, after the computation is complete, this matrix will be copied
// or accumulated back to C.
const bool is_ccr_mismatch =
( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) &&
!bli_obj_is_col_stored( &c_local ) );
const bool is_crc_mismatch =
( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) &&
!bli_obj_is_row_stored( &c_local ) );
obj_t ct;
bool use_ct = FALSE;
// FGVZ: Consider adding another guard here that only creates and uses a
// temporary matrix for accumulation if k < c * kc, where c is some small
// constant like 2. And don't forget to use the same conditional for the
// castm() and free() at the end.
if (
bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) ||
bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ||
is_ccr_mismatch ||
is_crc_mismatch
)
{
use_ct = TRUE;
}
// If we need a temporary matrix conformal to C for whatever reason,
// we create it and prepare to use it now.
if ( use_ct )
{
const dim_t m = bli_obj_length( &c_local );
const dim_t n = bli_obj_width( &c_local );
inc_t rs = bli_obj_row_stride( &c_local );
inc_t cs = bli_obj_col_stride( &c_local );
num_t dt_ct = bli_obj_domain( &c_local ) |
bli_obj_comp_prec( &c_local );
// When performing the crr case, accumulate to a contiguously-stored
// real matrix so we do not have to repeatedly update C with general
// stride.
if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) )
dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local );
// When performing the mismatched ccr or crc cases, now is the time
// to specify the appropriate storage so the gemm_md_c2r_ref() virtual
// microkernel can output directly to C (instead of using a temporary
// microtile).
if ( is_ccr_mismatch ) { rs = 1; cs = m; }
else if ( is_crc_mismatch ) { rs = n; cs = 1; }
bli_obj_create( dt_ct, m, n, rs, cs, &ct );
const num_t dt_exec = bli_obj_exec_dt( &c_local );
const num_t dt_comp = bli_obj_comp_dt( &c_local );
bli_obj_set_target_dt( dt_ct, &ct );
bli_obj_set_exec_dt( dt_exec, &ct );
bli_obj_set_comp_dt( dt_comp, &ct );
// A naive approach would cast C to the computation datatype,
// compute with beta, and then cast the result back to the
// user-provided output matrix. However, we employ a different
// approach that halves the number of memops on C (or its
// typecast temporary) by writing the A*B product directly to
// temporary storage, and then using xpbym to scale the
// output matrix by beta and accumulate/cast the A*B product.
//bli_castm( &c_local, &ct );
betap = &BLIS_ZERO;
cp = &ct;
}
#endif
#endif
// Invoke the internal back-end via the thread handler.
bli_l3_thread_decorator
(
bli_l3_int,
BLIS_GEMM, // operation family id
alpha,
&a_local,
&b_local,
betap,
cp,
cntx,
rntm,
cntl
);
#ifdef BLIS_ENABLE_GEMM_MD
#ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
// If we created a temporary matrix conformal to C for whatever reason,
// we copy/accumulate the result back to C and then release the object.
if ( use_ct )
{
// Recover the beta that was attached to c_local earlier so it can be
// applied while accumulating ct into C.
obj_t beta_local;
bli_obj_scalar_detach( &c_local, &beta_local );
//bli_castnzm( &ct, &c_local );
bli_xpbym( &ct, &beta_local, &c_local );
bli_obj_free( &ct );
}
#endif
#endif
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_front.h 0000664 0000000 0000000 00000004042 14634250137 0023144 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Front-end for gemm (defined in bli_gemm_front.c). Takes the scalars alpha
// and beta, the operands a, b and c, plus the context, runtime object and
// control tree with which the operation is to be executed.
void bli_gemm_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
#ifdef BLIS_ENABLE_SMALL_MATRIX
// Small-matrix gemm entry point, declared only when
// BLIS_ENABLE_SMALL_MATRIX is defined. Returns an err_t status; the
// (currently compiled-out) call site in bli_gemm_front() treats
// BLIS_SUCCESS as "problem handled" and returns immediately in that case.
// NOTE(review): no definition is visible in this part of the tree --
// confirm where, or whether, it is provided.
err_t bli_gemm_small
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
cntl_t* cntl
);
#endif
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_ker_var1.c 0000664 0000000 0000000 00000004262 14634250137 0023525 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if 0
#include "blis.h"
//
// Macro-kernel variant 1, expressed through variant 2.
//
// The whole sub-operation is transposed in place (A, B and C each have a
// transposition induced), after which bli_gemm_ker_var2() is invoked with
// the A and B operands exchanged. All remaining arguments are forwarded
// unchanged.
//
void bli_gemm_ker_var1( obj_t* a, obj_t* b, obj_t* c,
                        cntx_t* cntx, rntm_t* rntm,
                        cntl_t* cntl, thrinfo_t* thread )
{
    // Transpose every operand of the sub-problem.
    bli_obj_induce_trans( a );
    bli_obj_induce_trans( b );
    bli_obj_induce_trans( c );

    // Transposing the product requires the operands in reverse order.
    obj_t* left  = b;
    obj_t* right = a;

    bli_gemm_ker_var2( left, right, c, cntx, rntm, cntl, thread );
}
#endif
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_ker_var2.c 0000664 0000000 0000000 00000026153 14634250137 0023531 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Type-erased function pointer type for the m-by-n xpbys helpers generated
// below: x and y are matrices given as a void* buffer plus row and column
// strides, and b is a pointer to a scalar. The generated functions cast x to
// the first datatype and b and y to the second before calling the typed
// xpbys_mxn macro.
// NOTE(review): presumably the operation is y := x + b * y -- confirm
// against the definition of xpbys_mxn.
typedef void (*xpbys_mxn_vft)
(
dim_t m,
dim_t n,
void* x, inc_t rs_x, inc_t cs_x,
void* b,
void* y, inc_t rs_y, inc_t cs_y
);
// Template for the type-erased xpbys_mxn wrappers. For a datatype pair
// (ctypex/chx, ctypey/chy) it defines a function, named by
// PASTEMAC2(chx,chy,op), that casts the void* arguments to their concrete
// types (x to ctypex; b and y to ctypey) and forwards everything to the
// typed PASTEMAC3(chx,chy,chy,xpbys_mxn) macro.
// NOTE: The generated identifiers are spelled "xbpys" while the typedef and
// the underlying macro are spelled "xpbys". The spelling is used
// consistently within this block, so do not "correct" one side without the
// other.
#undef GENTFUNC2
#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \
\
void PASTEMAC2(chx,chy,op) \
( \
dim_t m, \
dim_t n, \
void* x, inc_t rs_x, inc_t cs_x, \
void* b, \
void* y, inc_t rs_y, inc_t cs_y \
) \
{ \
ctypex* restrict x_cast = x; \
ctypey* restrict b_cast = b; \
ctypey* restrict y_cast = y; \
\
PASTEMAC3(chx,chy,chy,xpbys_mxn) \
( \
m, n, \
x_cast, rs_x, cs_x, \
b_cast, \
y_cast, rs_y, cs_y \
); \
}
// Instantiate the wrappers via the BASIC0 and MIXDP0 insertion macros
// (presumably the same-precision and mixed-precision datatype pairs --
// confirm against their definitions).
INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn);
INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn);
// File-local table of the wrappers generated above, typed as xpbys_mxn_vft
// (presumably a 2D array named xbpys_mxn indexed by the two datatypes --
// confirm against GENARRAY2_ALL).
static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn);
// gemm macro-kernel: walks the micro-panels of the packed operands A and B
// and invokes the gemm micro-kernel once per microtile of C.
//   a, b    - packed operand objects; their pack schemas, panel dimensions
//             and panel strides are read below.
//   c       - output matrix object; also carries the beta scalar and the
//             optional ker_params struct.
//   cntx    - context queried for the micro-kernel and its storage
//             preference.
//   rntm    - not referenced in this function body.
//   cntl    - not referenced in this function body.
//   thread  - thrinfo_t node for the 2nd (jr) loop; its sub-node is used for
//             the 1st (ir) loop.
// When the execution datatype differs from the storage datatype of C, each
// microtile is first computed into a stack buffer and then accumulated into
// C through the xbpys_mxn table.
void bli_gemm_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
num_t dt_exec = bli_obj_exec_dt( c );
num_t dt_c = bli_obj_dt( c );
pack_t schema_a = bli_obj_pack_schema( a );
pack_t schema_b = bli_obj_pack_schema( b );
dim_t m = bli_obj_length( c );
dim_t n = bli_obj_width( c );
dim_t k = bli_obj_width( a );
// The buffers are held as char* so that all pointer arithmetic below is
// done in bytes (element counts are scaled by the datatype sizes).
char* a_cast = bli_obj_buffer_at_off( a );
inc_t is_a = bli_obj_imag_stride( a );
dim_t pd_a = bli_obj_panel_dim( a );
inc_t ps_a = bli_obj_panel_stride( a );
char* b_cast = bli_obj_buffer_at_off( b );
inc_t is_b = bli_obj_imag_stride( b );
dim_t pd_b = bli_obj_panel_dim( b );
inc_t ps_b = bli_obj_panel_stride( b );
char* c_cast = bli_obj_buffer_at_off( c );
inc_t rs_c = bli_obj_row_stride( c );
inc_t cs_c = bli_obj_col_stride( c );
// If any dimension is zero, return immediately.
if ( bli_zero_dim3( m, n, k ) ) return;
// Detach and multiply the scalars attached to A and B.
// NOTE: We know that the internal scalars of A and B are already of the
// target datatypes because the necessary typecasting would have already
// taken place during bli_packm_init().
obj_t scalar_a;
obj_t scalar_b;
bli_obj_scalar_detach( a, &scalar_a );
bli_obj_scalar_detach( b, &scalar_b );
bli_mulsc( &scalar_a, &scalar_b );
// Grab the addresses of the internal scalar buffers for the scalar
// merged above and the scalar attached to C.
// NOTE: We know that scalar_b is of type dt_exec due to the above code
// that casts the scalars of A and B to dt_exec via scalar_a and scalar_b,
// and we know that the internal scalar in C is already of the type dt_c
// due to the casting in the implementation of bli_obj_scalar_attach().
char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b );
char* beta_cast = bli_obj_internal_scalar_buffer( c );
// If 1m is being employed on a column- or row-stored matrix with a
// real-valued beta, we can use the real domain macro-kernel, which
// eliminates a little overhead associated with the 1m virtual
// micro-kernel.
// Only employ this optimization if the storage datatype of C is
// equal to the execution/computation datatype.
#if 1
if ( bli_cntx_method( cntx ) == BLIS_1M )
{
bli_gemm_ind_recast_1m_params
(
&dt_exec,
&dt_c,
schema_a,
c,
&m, &n, &k,
&pd_a, &ps_a,
&pd_b, &ps_b,
&rs_c, &cs_c
);
}
#endif
#ifdef BLIS_ENABLE_GEMM_MD
// Tweak parameters in select mixed domain cases (rcc, crc, ccr).
if ( bli_cntx_method( cntx ) == BLIS_NAT )
{
bli_gemm_md_ker_var2_recast
(
&dt_exec,
bli_obj_dt( a ),
bli_obj_dt( b ),
&dt_c,
&m, &n, &k,
&pd_a, &ps_a,
&pd_b, &ps_b,
c,
&rs_c, &cs_c
);
}
#endif
// The datatype sizes are queried only after the recasts above, since
// those calls receive pointers to dt_exec and dt_c and may change them.
siz_t dt_size = bli_dt_size( dt_exec );
siz_t dt_c_size = bli_dt_size( dt_c );
// Alias some constants to simpler names.
const dim_t MR = pd_a;
const dim_t NR = pd_b;
//const dim_t PACKMR = cs_a;
//const dim_t PACKNR = rs_b;
// Query the context for the micro-kernel address and cast it to its
// function pointer type.
gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx );
// Query the params field from the obj_t. If it is non-NULL, grab the ukr
// field of the params struct. If that function pointer is non-NULL, use it
// as our microkernel instead of the default microkernel queried from the
// cntx above.
gemm_ker_params_t* params = bli_obj_ker_params( c );
gemm_ukr_vft user_ukr = params ? params->ukr : NULL;
if ( user_ukr ) gemm_ukr = user_ukr;
// Temporary C buffer for edge cases. Note that the strides of this
// temporary buffer are set so that they match the storage of the
// original C matrix. For example, if C is column-stored, ct will be
// column-stored as well.
char ct[ BLIS_STACK_BUF_MAX_SIZE ]
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE)));
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx );
const inc_t rs_ct = ( col_pref ? 1 : NR );
const inc_t cs_ct = ( col_pref ? MR : 1 );
// Address of a zero constant of type dt_exec; passed as beta when the
// micro-kernel writes into ct.
char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO );
//
// Assumptions/assertions:
// rs_a == 1
// cs_a == PACKMR
// pd_a == MR
// ps_a == stride to next micro-panel of A
// rs_b == PACKNR
// cs_b == 1
// pd_b == NR
// ps_b == stride to next micro-panel of B
// rs_c == (no assumptions)
// cs_c == (no assumptions)
//
// Compute number of primary and leftover components of the m and n
// dimensions.
dim_t n_iter = n / NR;
dim_t n_left = n % NR;
dim_t m_iter = m / MR;
dim_t m_left = m % MR;
// A partial (edge) panel counts as one additional iteration.
if ( n_left ) ++n_iter;
if ( m_left ) ++m_iter;
// Determine some increments used to step through A, B, and C.
// These are byte increments: A and B step by whole micro-panels scaled by
// the execution datatype size, C by MR rows / NR columns scaled by the
// storage datatype size of C.
inc_t rstep_a = ps_a * dt_size;
inc_t cstep_b = ps_b * dt_size;
inc_t rstep_c = rs_c * MR * dt_c_size;
inc_t cstep_c = cs_c * NR * dt_c_size;
auxinfo_t aux;
// Save the pack schemas of A and B to the auxinfo_t object.
bli_auxinfo_set_schema_a( schema_a, &aux );
bli_auxinfo_set_schema_b( schema_b, &aux );
// Save the imaginary stride of A and B to the auxinfo_t object.
bli_auxinfo_set_is_a( is_a, &aux );
bli_auxinfo_set_is_b( is_b, &aux );
// Save the virtual microkernel address and the params.
bli_auxinfo_set_ukr( gemm_ukr, &aux );
bli_auxinfo_set_params( params, &aux );
// The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
// loop around the microkernel. Here we query the thrinfo_t node for the
// 1st (ir) loop around the microkernel.
thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
// Query the number of threads and thread ids for each loop.
dim_t jr_nt = bli_thread_n_way( thread );
dim_t jr_tid = bli_thread_work_id( thread );
dim_t ir_nt = bli_thread_n_way( caucus );
dim_t ir_tid = bli_thread_work_id( caucus );
dim_t jr_start, jr_end;
dim_t ir_start, ir_end;
dim_t jr_inc, ir_inc;
// Determine the thread range and increment for the 2nd and 1st loops.
// NOTE: The definition of bli_thread_range_jrir() will depend on whether
// slab or round-robin partitioning was requested at configure-time.
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );
// Loop over the n dimension (NR columns at a time).
for ( dim_t j = jr_start; j < jr_end; j += jr_inc )
{
char* b1 = b_cast + j * cstep_b;
char* c1 = c_cast + j * cstep_c;
// Use the leftover width on the edge iteration, NR otherwise.
dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left );
// Initialize our next panel of B to be the current panel of B.
char* b2 = b1;
// Loop over the m dimension (MR rows at a time).
for ( dim_t i = ir_start; i < ir_end; i += ir_inc )
{
char* a1 = a_cast + i * rstep_a;
char* c11 = c1 + i * rstep_c;
// Use the leftover height on the edge iteration, MR otherwise.
dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left );
// Compute the addresses of the next panels of A and B.
char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc );
// On this thread's last ir iteration the next A panel wraps back to the
// start of A and the next B panel advances; on the last jr iteration
// as well, the next B panel wraps back to the start of B.
if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) )
{
a2 = a_cast;
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc );
if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) )
b2 = b_cast;
}
// Save addresses of next panels of A and B to the auxinfo_t
// object.
bli_auxinfo_set_next_a( a2, &aux );
bli_auxinfo_set_next_b( b2, &aux );
// Edge case handling now occurs within the microkernel itself, but
// we must still explicitly accumulate to a temporary microtile in
// situations where a virtual microkernel is being used, such as
// during the 1m method or some cases of mixed datatypes.
if ( dt_exec == dt_c )
{
// Invoke the gemm micro-kernel.
gemm_ukr
(
m_cur,
n_cur,
k,
alpha_cast,
a1,
b1,
beta_cast,
c11, rs_c, cs_c,
&aux,
cntx
);
}
else
{
// Invoke the gemm micro-kernel.
// A full MR x NR microtile is computed into ct with beta = zero, so
// the previous contents of ct are not accumulated.
gemm_ukr
(
MR,
NR,
k,
alpha_cast,
a1,
b1,
zero,
&ct, rs_ct, cs_ct,
&aux,
cntx
);
// Accumulate to C with type-casting.
// Only the m_cur x n_cur portion of ct is combined into C, using the
// beta scalar attached to C.
xbpys_mxn[ dt_exec ][ dt_c ]
(
m_cur, n_cur,
&ct, rs_ct, cs_ct,
beta_cast,
c11, rs_c, cs_c
);
}
}
}
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" );
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" );
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" );
*/
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_md.c 0000664 0000000 0000000 00000051743 14634250137 0022421 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_GEMM_MD
void bli_gemm_md
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// Mixed-domain gemm setup. Selects the handler that matches the
	// (C, A, B) domain combination, lets it adjust the operands and the
	// context, and then records the target precision, execution datatype
	// and computation datatype in the operand objects.

	// Signature shared by the eight per-case handlers.
	typedef mddm_t (*md_case_ft)
	(
	  obj_t*, obj_t*, obj_t*, obj_t*, cntx_t*, cntx_t**
	);

	// Classify the domain of each operand.
	const bool a_real = bli_obj_is_real( a );
	const bool a_cplx = bli_obj_is_complex( a );
	const bool b_real = bli_obj_is_real( b );
	const bool b_cplx = bli_obj_is_complex( b );
	const bool c_real = bli_obj_is_real( c );
	const bool c_cplx = bli_obj_is_complex( c );

	// Pick the handler for this combination. The handler name lists the
	// domains in the order C, A, B (r = real, c = complex). The handler
	// stays NULL if no combination matches.
	md_case_ft handler = NULL;

	if ( c_real )
	{
		if      ( a_real && b_real ) handler = bli_gemm_md_rrr;
		else if ( a_cplx && b_cplx ) handler = bli_gemm_md_rcc;
		else if ( a_cplx && b_real ) handler = bli_gemm_md_rcr;
		else if ( a_real && b_cplx ) handler = bli_gemm_md_rrc;
	}
	else if ( c_cplx )
	{
		if      ( a_cplx && b_cplx ) handler = bli_gemm_md_ccc;
		else if ( a_cplx && b_real ) handler = bli_gemm_md_ccr;
		else if ( a_real && b_cplx ) handler = bli_gemm_md_crc;
		else if ( a_real && b_real ) handler = bli_gemm_md_crr;
	}

	mddm_t doms;

	if ( handler != NULL )
	{
		doms = handler( a, b, beta, c, cntx_local, cntx );
	}
	else
	{
		// No combination matched. This is not expected to happen; the
		// domains are still given defined values before aborting.
		doms.comp = BLIS_REAL;
		doms.exec = BLIS_REAL;

		bli_abort();
	}

	// The computation precision is read from C (the user may have set it
	// explicitly). It is queried only after the handler has run.
	const prec_t prec_comp = bli_obj_comp_prec( c );

	// A and B are targeted at the computation precision. The target
	// domain is not set here; it is either unchanged or was already
	// adjusted by the handler above.
	bli_obj_set_target_prec( prec_comp, a );
	bli_obj_set_target_prec( prec_comp, b );

	// Execution datatype = execution domain combined with the computation
	// precision (computation and execution precision are always equal).
	const num_t dt_exec = doms.exec | prec_comp;

	bli_obj_set_exec_dt( dt_exec, a );
	bli_obj_set_exec_dt( dt_exec, b );
	bli_obj_set_exec_dt( dt_exec, c );

	// Computation datatype = computation domain combined with the
	// computation precision.
	const num_t dt_comp = doms.comp | prec_comp;

	bli_obj_set_comp_dt( dt_comp, a );
	bli_obj_set_comp_dt( dt_comp, b );
	bli_obj_set_comp_dt( dt_comp, c );
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_ccr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// ccr case: C and A are complex, B is real. The microkernel computes
	// in the real domain, while the rest of the implementation sees a
	// complex execution domain. The requested computation domain is
	// assumed to be complex.
	mddm_t doms;

	doms.comp = BLIS_REAL;
	doms.exec = BLIS_COMPLEX;

	// The computation datatype is the real projection of the execution
	// datatype; it is used to query the microkernel's output preference.
	const num_t dt_real = BLIS_REAL | bli_obj_comp_prec( c );

	// This case (B real) is only handled directly when the microkernel
	// prefers column output. With a row-preferring microkernel the
	// operation is transposed, which turns it into the crc case where A
	// (formerly B) is real.
	if ( bli_cntx_l3_nat_ukr_prefers_rows_dt( dt_real, BLIS_GEMM_UKR, *cntx ) )
	{
		bli_obj_swap( a, b );

		bli_obj_induce_trans( a );
		bli_obj_induce_trans( b );
		bli_obj_induce_trans( c );

		// The pack schemas were set before the swap, so swap them too.
		bli_obj_swap_pack_schemas( a, b );

		return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
	}

	// Work on a private copy of the context from here on, and make the
	// caller use that copy as well.
	*cntx_local = **cntx;
	*cntx       = cntx_local;

	blksz_t* const blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
	blksz_t* const blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
	blksz_t* const blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
	blksz_t* const blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
	blksz_t* const blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

	// Overwrite every complex blocksize slot with its real counterpart.
	blksz_t* const all_blkszs[] =
	{
		blksz_mr, blksz_nr, blksz_mc, blksz_nc, blksz_kc
	};
	const dim_t n_all = ( dim_t )( sizeof( all_blkszs ) / sizeof( all_blkszs[0] ) );

	for ( dim_t s = 0; s < n_all; ++s )
	{
		bli_blksz_copy_dt( BLIS_FLOAT,  all_blkszs[ s ], BLIS_SCOMPLEX, all_blkszs[ s ] );
		bli_blksz_copy_dt( BLIS_DOUBLE, all_blkszs[ s ], BLIS_DCOMPLEX, all_blkszs[ s ] );
	}

	// Halve MR and then MC for all four datatypes (the complex slots now
	// hold real blocksizes as well).
	const num_t all_dts[] =
	{
		BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX
	};
	const dim_t n_dts = ( dim_t )( sizeof( all_dts ) / sizeof( all_dts[0] ) );

	blksz_t* const halved[] = { blksz_mr, blksz_mc };
	const dim_t n_halved = ( dim_t )( sizeof( halved ) / sizeof( halved[0] ) );

	for ( dim_t h = 0; h < n_halved; ++h )
	{
		for ( dim_t d = 0; d < n_dts; ++d )
		{
			bli_blksz_scale_def_max( 1, 2, all_dts[ d ], halved[ h ] );
		}
	}

	// The default pack schemas in the objects are left as they are.

	// Install the mixed-domain c2r virtual microkernel for both complex
	// datatypes, rather than checking which one is actually in use.
	func_t* vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );

	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, vir_ukrs );
	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, vir_ukrs );

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_crc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// crc case: C and B are complex, A is real. The microkernel computes
	// in the real domain, while the rest of the implementation sees a
	// complex execution domain. The requested computation domain is
	// assumed to be complex.
	mddm_t doms;

	doms.comp = BLIS_REAL;
	doms.exec = BLIS_COMPLEX;

	// The computation datatype is the real projection of the execution
	// datatype; it is used to query the microkernel's output preference.
	const num_t dt_real = BLIS_REAL | bli_obj_comp_prec( c );

	// This case (A real) is only handled directly when the microkernel
	// prefers row output. With a column-preferring microkernel the
	// operation is transposed, which turns it into the ccr case where B
	// (formerly A) is real.
	if ( bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_real, BLIS_GEMM_UKR, *cntx ) )
	{
		bli_obj_swap( a, b );

		bli_obj_induce_trans( a );
		bli_obj_induce_trans( b );
		bli_obj_induce_trans( c );

		// The pack schemas were set before the swap, so swap them too.
		bli_obj_swap_pack_schemas( a, b );

		return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
	}

	// Work on a private copy of the context from here on, and make the
	// caller use that copy as well.
	*cntx_local = **cntx;
	*cntx       = cntx_local;

	blksz_t* const blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
	blksz_t* const blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
	blksz_t* const blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
	blksz_t* const blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
	blksz_t* const blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

	// Overwrite every complex blocksize slot with its real counterpart.
	blksz_t* const all_blkszs[] =
	{
		blksz_mr, blksz_nr, blksz_mc, blksz_nc, blksz_kc
	};
	const dim_t n_all = ( dim_t )( sizeof( all_blkszs ) / sizeof( all_blkszs[0] ) );

	for ( dim_t s = 0; s < n_all; ++s )
	{
		bli_blksz_copy_dt( BLIS_FLOAT,  all_blkszs[ s ], BLIS_SCOMPLEX, all_blkszs[ s ] );
		bli_blksz_copy_dt( BLIS_DOUBLE, all_blkszs[ s ], BLIS_DCOMPLEX, all_blkszs[ s ] );
	}

	// Halve NR and then NC for all four datatypes (the complex slots now
	// hold real blocksizes as well).
	const num_t all_dts[] =
	{
		BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX
	};
	const dim_t n_dts = ( dim_t )( sizeof( all_dts ) / sizeof( all_dts[0] ) );

	blksz_t* const halved[] = { blksz_nr, blksz_nc };
	const dim_t n_halved = ( dim_t )( sizeof( halved ) / sizeof( halved[0] ) );

	for ( dim_t h = 0; h < n_halved; ++h )
	{
		for ( dim_t d = 0; d < n_dts; ++d )
		{
			bli_blksz_scale_def_max( 1, 2, all_dts[ d ], halved[ h ] );
		}
	}

	// The default pack schemas in the objects are left as they are.

	// Install the mixed-domain c2r virtual microkernel for both complex
	// datatypes, rather than checking which one is actually in use.
	func_t* vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );

	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, vir_ukrs );
	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, vir_ukrs );

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_rcc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// rcc case: C is real, A and B are complex. The microkernel computes
	// in the real domain, and because C is real the execution domain is
	// real too. The requested computation domain is assumed to be complex.
	mddm_t doms;

	doms.comp = BLIS_REAL;
	doms.exec = BLIS_REAL;

	// Work on a private copy of the context from here on, and make the
	// caller use that copy as well.
	*cntx_local = **cntx;
	*cntx       = cntx_local;

	blksz_t* const blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
	blksz_t* const blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
	blksz_t* const blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
	blksz_t* const blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
	blksz_t* const blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

	// Overwrite every complex blocksize slot with its real counterpart.
	blksz_t* const all_blkszs[] =
	{
		blksz_mr, blksz_nr, blksz_mc, blksz_nc, blksz_kc
	};
	const dim_t n_all = ( dim_t )( sizeof( all_blkszs ) / sizeof( all_blkszs[0] ) );

	for ( dim_t s = 0; s < n_all; ++s )
	{
		bli_blksz_copy_dt( BLIS_FLOAT,  all_blkszs[ s ], BLIS_SCOMPLEX, all_blkszs[ s ] );
		bli_blksz_copy_dt( BLIS_DOUBLE, all_blkszs[ s ], BLIS_DCOMPLEX, all_blkszs[ s ] );
	}

	// Halve KC for all four datatypes (the complex slots now hold real
	// blocksizes as well).
	const num_t all_dts[] =
	{
		BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX
	};
	const dim_t n_dts = ( dim_t )( sizeof( all_dts ) / sizeof( all_dts[0] ) );

	for ( dim_t d = 0; d < n_dts; ++d )
	{
		bli_blksz_scale_def_max( 1, 2, all_dts[ d ], blksz_kc );
	}

	// Pack both A and B with the 1r schema, and toggle the conjugation
	// of B so that the product yields ar * br - ai * bi.
	bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
	bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );

	bli_obj_toggle_conj( b );

	// Copy the packm kernels over from the 1m context.
	// NOTE: This matters when the rcc case involves no precision casting.
	// bli_packm_blk_var1() currently hands off to bli_packm_blk_var1_md()
	// only when the storage datatype differs from the target datatype.
	// packm_blk_var1_md() has built-in support for the 1r (and 1e)
	// schemas, whereas packm_blk_var1() relies on packm kernels for 1r.
	const num_t dt_complex = bli_obj_dt( a );

	cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );

	func_t* dst_kers = bli_cntx_packm_kers_buf( *cntx );
	func_t* src_kers = bli_cntx_packm_kers_buf( cntx_1m );

	// Entries 0 through BLIS_PACKM_31XK_KER (inclusive) are copied.
	const dim_t n_kers = BLIS_PACKM_31XK_KER + 1;

	for ( dim_t ker = 0; ker < n_kers; ++ker )
	{
		dst_kers[ ker ] = src_kers[ ker ];
	}

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_crr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// crr case: C is complex, A and B are real. The microkernel computes
	// in the real domain, and since only the real part of C is updated,
	// the execution domain is real as well. The requested computation
	// domain is assumed to be real.
	const mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };

#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
	// Because A*B is real, only the real part of C needs updating, so C
	// can be replaced by an object that aliases just its real part. This
	// shortcut is skipped in two situations:
	// - Extra-memory optimizations are enabled (this region is then
	//   compiled out): A*B is computed into a temporary matrix and
	//   accumulated back into C, and that code must keep accessing C as
	//   a complex matrix.
	// - beta is non-unit: with C projected to real, the imaginary
	//   components of C would never be scaled by beta and the result
	//   would be wrong, so C is left alone.
	if ( bli_obj_equals( beta, &BLIS_ONE ) )
	{
		obj_t c_real;

		bli_obj_real_part( c, &c_real );

		// Replace the complex object with its real-only alias.
		*c = c_real;
	}
#endif

	// The default pack schemas in the objects are left as they are.

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_rcr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// rcr case: C and B are real, A is complex. The microkernel computes
	// in the real domain, and because C is real the execution domain is
	// real too. The requested computation domain is assumed to be real.
	const mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };

	// Replace the complex A object with an alias of only its real part.
	obj_t a_real;

	bli_obj_real_part( a, &a_real );

	*a = a_real;

	// The default pack schemas in the objects are left as they are.

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_rrc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// rrc case: C and A are real, B is complex. The microkernel computes
	// in the real domain, and because C is real the execution domain is
	// real too. The requested computation domain is assumed to be real.
	const mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };

	// Replace the complex B object with an alias of only its real part.
	obj_t b_real;

	bli_obj_real_part( b, &b_real );

	*b = b_real;

	// The default pack schemas in the objects are left as they are.

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_rrr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// rrr case: C, A and B are all real, so both the computation
	// (microkernel) domain and the execution domain are real. The
	// requested computation domain is assumed to be real. No operand or
	// context is modified, and the default pack schemas are kept.
	const mddm_t doms = { .comp = BLIS_REAL, .exec = BLIS_REAL };

	return doms;
}
// -----------------------------------------------------------------------------
// cab
mddm_t bli_gemm_md_ccc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
	// ccc case: C, A and B are all complex, so both the computation
	// (microkernel) domain and the execution domain are complex. The
	// requested computation domain is assumed to be complex. No operand
	// or context is modified, and the default pack schemas are kept.
	const mddm_t doms = { .comp = BLIS_COMPLEX, .exec = BLIS_COMPLEX };

	return doms;
}
#endif
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_md.h 0000664 0000000 0000000 00000022130 14634250137 0022412 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_gemm_md_c2r_ref.h"
// Define a local struct type that makes returning two values easier.
// It is the return type of the bli_gemm_md_<case>() functions declared
// below.
typedef struct mddm_s
{
// Computation domain.
dom_t comp;
// Execution domain.
dom_t exec;
} mddm_t;
// Mixed-domain gemm entry point.
//   a, b        - input operand objects.
//   beta        - scalar object for C.
//   c           - output operand object.
//   cntx_local  - caller-provided context storage that may be filled with a
//                 modified copy of the context.
//   cntx        - address of the context pointer; may be redirected to
//                 cntx_local.
void bli_gemm_md
(
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx_local,
cntx_t** cntx
);
// Per-case handlers, one for each combination of operand domains. All share
// the parameter list of bli_gemm_md() and return the computation and
// execution domains in an mddm_t.
// NOTE(review): the three-letter suffix appears to give the domains of C, A
// and B in that order (r = real, c = complex); confirm against the
// definitions.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
// -----------------------------------------------------------------------------
// Prototypes for two routines that share one parameter list: the alpha and
// beta scalar objects, the operands a, b and c, and the context, runtime and
// control-tree arguments. Their definitions are not part of this header.
// NOTE(review): presumably bli_gemm_md_front() is the front-end for
// mixed-datatype gemm; confirm against its definition.
void bli_gemm_md_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
// NOTE(review): purpose not visible from this header; see the definition.
void bli_gemm_md_zgemm
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
{
	// Returns TRUE only for the crr case: C complex, A real, B real, with
	// a real execution domain recorded in C.
	// NOTE: The execution-domain test only becomes necessary if/when the
	// user is allowed to specify the computation domain. That domain is
	// currently ignored; once honored, it will affect the execution
	// domain tested here. Until then the test is redundant, because crr
	// is unconditionally associated with an execution domain of
	// BLIS_REAL.
	return ( bool )
	       ( bli_obj_is_complex( c ) &&
	         bli_obj_is_real( a )    &&
	         bli_obj_is_real( b )    &&
	         bli_obj_exec_domain( c ) == BLIS_REAL );
}
BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
{
	// Returns TRUE only for the ccr case: C complex, A complex, B real,
	// with a complex execution domain recorded in C.
	// NOTE: The execution-domain test only becomes necessary if/when the
	// user is allowed to specify the computation domain. That domain is
	// currently ignored; once honored, it will affect the execution
	// domain tested here. Until then the test is redundant, because ccr
	// is unconditionally associated with an execution domain of
	// BLIS_COMPLEX.
	return ( bool )
	       ( bli_obj_is_complex( c ) &&
	         bli_obj_is_complex( a ) &&
	         bli_obj_is_real( b )    &&
	         bli_obj_exec_domain( c ) == BLIS_COMPLEX );
}
BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
{
	// Returns TRUE only for the crc case: C complex, A real, B complex,
	// with a complex execution domain recorded in C.
	// NOTE: The execution-domain test only becomes necessary if/when the
	// user is allowed to specify the computation domain. That domain is
	// currently ignored; once honored, it will affect the execution
	// domain tested here. Until then the test is redundant, because crc
	// is unconditionally associated with an execution domain of
	// BLIS_COMPLEX.
	return ( bool )
	       ( bli_obj_is_complex( c ) &&
	         bli_obj_is_real( a )    &&
	         bli_obj_is_complex( b ) &&
	         bli_obj_exec_domain( c ) == BLIS_COMPLEX );
}
// -----------------------------------------------------------------------------
// Adjusts macrokernel parameters for the mixed-domain gemm cases, based on
// the domains of C (*dt_c), A (dt_a) and B (dt_b). Every pointer argument is
// updated in place; c is consulted only for its attached scalar (beta) and
// for its storage and computation precisions.
//   - real C, complex A, complex B (rcc): k, ps_a and ps_b are doubled.
//   - complex C, real A, complex B (crc): either switch to the real domain
//     (projecting *dt_comp and *dt_c to real and doubling n, pd_b, ps_b and
//     rs_c), or halve ps_a.
//   - complex C, complex A, real B (ccr): either switch to the real domain
//     (projecting *dt_comp and *dt_c to real and doubling m, pd_a, ps_a and
//     cs_c), or halve ps_b.
//   - any other combination: nothing is modified.
BLIS_INLINE void bli_gemm_md_ker_var2_recast
(
num_t* dt_comp,
num_t dt_a,
num_t dt_b,
num_t* dt_c,
dim_t* m,
dim_t* n,
dim_t* k,
inc_t* pd_a, inc_t* ps_a,
inc_t* pd_b, inc_t* ps_b,
obj_t* c,
inc_t* rs_c, inc_t* cs_c
)
{
if ( bli_is_real( *dt_c ) &&
bli_is_complex( dt_a ) &&
bli_is_complex( dt_b ) )
{
// The rcc case is executed with a real macrokernel, so we need to
// double the k dimension (because both A and B are packed to the 1r
// schema), and also the panel strides of A and B since they were
// packed as complex matrices and we now need to convert them to
// units of real elements.
*k *= 2;
*ps_a *= 2;
*ps_b *= 2;
}
else if ( bli_is_complex( *dt_c ) &&
bli_is_real( dt_a ) &&
bli_is_complex( dt_b ) )
{
// crc case. The "#if 1" below enables the real-domain optimization; if it
// were disabled, only the braced fallback block after "#endif" would remain.
#if 1
obj_t beta;
// Detach beta from C so that its imaginary component can be tested.
bli_obj_scalar_detach( c, &beta );
if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
bli_obj_imag_is_zero( &beta ) &&
bli_is_row_stored( *rs_c, *cs_c ) &&
bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
{
// If beta is real, and C is row-stored, and the computation
// precision is equal to the storage precision of C, we can use the
// real macrokernel (and real microkernel, which is already stored
// to the real virtual microkernel slots of the context) instead of
// the complex macrokernel and c2r virtual microkernel.
*dt_comp = bli_dt_proj_to_real( *dt_comp );
*dt_c = bli_dt_proj_to_real( *dt_c );
// Express the n dimension, B's panel dimension/stride and the row
// stride of C in units of real elements.
*n *= 2;
*pd_b *= 2; *ps_b *= 2;
*rs_c *= 2;
}
else
#endif
{
// Generally speaking, the crc case is executed with a complex
// macrokernel, so we need to halve the panel stride of A (which
// is real) since the macrokernel will perform the pointer
// arithmetic in units of complex elements.
*ps_a /= 2;
}
}
else if ( bli_is_complex( *dt_c ) &&
bli_is_complex( dt_a ) &&
bli_is_real( dt_b ) )
{
// ccr case. This mirrors the crc case above with the roles of A/m/columns
// and B/n/rows exchanged.
#if 1
obj_t beta;
// Detach beta from C so that its imaginary component can be tested.
bli_obj_scalar_detach( c, &beta );
if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
bli_obj_imag_is_zero( &beta ) &&
bli_is_col_stored( *rs_c, *cs_c ) &&
bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
{
// If beta is real, and C is column-stored, and the computation
// precision is equal to the storage precision of C, we can use the
// real macrokernel (and real microkernel, which is already stored
// to the real virtual microkernel slots of the context) instead of
// the complex macrokernel and c2r virtual microkernel.
*dt_comp = bli_dt_proj_to_real( *dt_comp );
*dt_c = bli_dt_proj_to_real( *dt_c );
// Express the m dimension, A's panel dimension/stride and the column
// stride of C in units of real elements.
*m *= 2;
*pd_a *= 2; *ps_a *= 2;
*cs_c *= 2;
}
else
#endif
{
// Generally speaking, the ccr case is executed with a complex
// macrokernel, so we need to halve the panel stride of B (which
// is real) since the macrokernel will perform the pointer
// arithmetic in units of complex elements.
*ps_b /= 2;
}
}
// The remaining domain combinations need no adjustment; the branches below
// are disabled and kept for reference only.
// NOTE(review): these disabled branches pass dt_c (a num_t*) to
// bli_is_real()/bli_is_complex() without dereferencing it, unlike the live
// branches above which use *dt_c. Fix before re-enabling.
#if 0
else if ( bli_is_real( dt_c ) &&
bli_is_real( dt_a ) &&
bli_is_real( dt_b ) )
{
// No action needed.
//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
}
else if ( bli_is_complex( dt_c ) &&
bli_is_real( dt_a ) &&
bli_is_real( dt_b ) )
{
// No action needed.
}
else if ( bli_is_real( dt_c ) &&
bli_is_complex( dt_a ) &&
bli_is_real( dt_b ) )
{
// No action needed.
}
else if ( bli_is_real( dt_c ) &&
bli_is_real( dt_a ) &&
bli_is_complex( dt_b ) )
{
// No action needed.
}
#endif
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_md_c2r_ref.c 0000664 0000000 0000000 00000020524 14634250137 0024014 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_GEMM_MD
// Template for the reference "c2r" virtual gemm micro-kernel used by the
// mixed-domain code. For a complex type ch (with real projection chr) the
// generated function:
//   - queries the context for the native real-domain gemm micro-kernel
//     (rgemm_ukr) and its column/row output preference;
//   - reinterprets the a and b pointers as arrays of real elements and uses
//     only the real part of alpha;
//   - if beta has a non-zero imaginary part, or the storage of c disagrees
//     with the micro-kernel preference, or c is general-stored, computes
//     alpha_r * a * b into the local buffer ct (full mr x nr microtile, with
//     zero passed as beta) and then accumulates the leading m x n part of ct
//     into c using adds (beta == 1), copys (beta == 0) or xpbys (otherwise);
//   - otherwise calls rgemm_ukr directly on c with the real part of beta.
// In both paths the microtile of complex elements is presented to the real
// micro-kernel by doubling the non-unit stride and the dimension that runs
// along the unit stride.
// The INSERT_GENTFUNCCO_BASIC line at the end presumably instantiates the
// template once per complex datatype -- that macro is defined elsewhere.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \
\
void PASTEMAC2(ch,opname,suf) \
( \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* restrict alpha, \
ctype* restrict a, \
ctype* restrict b, \
ctype* restrict beta, \
ctype* restrict c, inc_t rs_c, inc_t cs_c, \
auxinfo_t* restrict data, \
cntx_t* restrict cntx \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
const num_t dt_r = PASTEMAC(chr,type); \
\
PASTECH(chr,gemm_ukr_ft) \
rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
const bool row_pref = !col_pref; \
\
const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
dim_t mr_r = mr; \
dim_t nr_r = nr; \
\
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype_r ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
inc_t rs_ct; \
inc_t cs_ct; \
\
ctype_r* restrict a_r = ( ctype_r* )a; \
\
ctype_r* restrict b_r = ( ctype_r* )b; \
\
ctype_r* restrict zero_r = PASTEMAC(chr,0); \
\
ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \
/*
ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \
*/ \
\
ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \
ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \
\
dim_t m_use; \
dim_t n_use; \
\
ctype_r* c_use; \
inc_t rs_c_use; \
inc_t cs_c_use; \
\
bool using_ct; \
\
/* This virtual microkernel is used by ccr and crc mixed-domain cases
when any of the following conditions are met:
- beta is complex (ie: has a non-zero imaginary component)
- C is general-stored
- the computation precision differs from the storage of C
If, however, none of the above conditions are met, then the real
domain macrokernel can be (and will be) called instead of calling
the complex macrokernel (and this virtual microkernel). */ \
\
/*
PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, k, \
a_r, 1, mr, "%5.2f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: b", k, nr, \
b_r, nr, 1, "%5.2f", "" ); \
PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \
c_use, rs_c_use, cs_c_use, "%5.2f", "" ); \
*/ \
\
/* SAFETY CHECK: The higher level implementation should never
allow an alpha with non-zero imaginary component to be passed
in, because it can't be applied properly using the 1m method.
If alpha is not real, then something is very wrong. */ \
/*
if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
*/ \
\
/* If beta has a non-zero imaginary component OR if c is stored with
general stride, then we compute the alpha*a*b product into temporary
storage and then accumulate that result into c afterwards. Note that
the other two cases concerning disagreement between the storage of C
and the output preference of the micro-kernel, should ONLY occur in
the context of trsm, whereby this virtual micro-kernel is called
directly from the trsm macro-kernel to update the micro-tile b11
that exists within the packed row-panel of B. Indeed that is the
reason those cases MUST be explicitly handled. */ \
if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \
else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \
else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \
else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \
else using_ct = FALSE; \
\
\
if ( using_ct ) \
{ \
/* In the atypical cases, we compute the result into temporary
workspace ct and then accumulate it back to c at the end. */ \
\
/* Set the strides of ct based on the preference of the underlying
native real domain gemm micro-kernel. Note that we set the ct
strides in units of complex elements. */ \
if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \
else { rs_ct = nr; cs_ct = 1; } \
\
c_use = ( ctype_r* )ct; \
rs_c_use = rs_ct; \
cs_c_use = cs_ct; \
\
/* Convert the strides and corresponding microtile dimension from being
in units of complex elements to be in units of real elements. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; mr_r *= 2; } \
else { rs_c_use *= 2; nr_r *= 2; }\
\
/* ct = alpha_r * a * b; (zero is passed as beta for this call) */ \
rgemm_ukr \
( \
mr_r, \
nr_r, \
k, \
alpha_r, \
a_r, \
b_r, \
zero_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
\
/* Accumulate the final result in ct back to c. */ \
if ( PASTEMAC(ch,eq1)( *beta ) ) \
{ \
for ( dim_t j = 0; j < n; ++j ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
else if ( PASTEMAC(ch,eq0)( *beta ) ) \
{ \
for ( dim_t j = 0; j < n; ++j ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
else \
{ \
for ( dim_t j = 0; j < n; ++j ) \
for ( dim_t i = 0; i < m; ++i ) \
{ \
PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \
*beta, \
*(c + i*rs_c + j*cs_c ) ); \
} \
} \
} \
else \
{ \
/* In the typical cases, we use the real part of beta and
accumulate directly into the output matrix c. */ \
\
c_use = ( ctype_r* )c; \
rs_c_use = rs_c; \
cs_c_use = cs_c; \
m_use = m; \
n_use = n; \
\
/* Convert the strides and corresponding microtile dimension from being
in units of complex elements to be in units of real elements. */ \
if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; m_use *= 2; } \
else { rs_c_use *= 2; n_use *= 2; } \
\
/* c = beta_r * c + alpha_r * a * b; */ \
rgemm_ukr \
( \
m_use, \
n_use, \
k, \
alpha_r, \
a_r, \
b_r, \
beta_r, \
c_use, rs_c_use, cs_c_use, \
data, \
cntx \
); \
} \
}
INSERT_GENTFUNCCO_BASIC( gemm_md_c2r, BLIS_REF_SUFFIX )
#endif
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_md_c2r_ref.h 0000664 0000000 0000000 00000003543 14634250137 0024023 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- Level-3 native micro-kernel prototype redefinitions ----------------------
// Point the gemm_ukr_name macro at gemm_md_c2r_ref so that the template
// included below is expanded for that micro-kernel name. (Presumably the
// template emits one prototype per datatype; bli_l3_ukr.h is not visible
// here, so confirm there.)
#undef gemm_ukr_name
#define gemm_ukr_name gemm_md_c2r_ref
// Include the native micro-kernel API template.
#include "bli_l3_ukr.h"
cython-blis-1.0.0/blis/_src/frame/3/gemm/bli_gemm_var.h 0000664 0000000 0000000 00000004312 14634250137 0022604 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// gemm kernel parameter struct.
//
typedef struct
{
// Micro-kernel function pointer, of type gemm_ukr_vft. How and where this
// field is consumed is not visible in this header; presumably it lets a
// caller supply the micro-kernel to the gemm kernel variants -- confirm at
// the use sites.
gemm_ukr_vft ukr;
} gemm_ker_params_t;
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares the object-based function PASTEMAC0(opname),
// which returns void and takes the operands a, b and c followed by the
// context, runtime, control-tree node and thread-info pointers. It is
// expanded below for the three blocked gemm variants (gemm_blk_var1..3) and
// the two kernel variants (gemm_ker_var1..2).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENPROT( gemm_blk_var1 )
GENPROT( gemm_blk_var2 )
GENPROT( gemm_blk_var3 )
GENPROT( gemm_ker_var1 )
GENPROT( gemm_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemm/ind/ 0000775 0000000 0000000 00000000000 14634250137 0020562 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/gemm/ind/bli_gemm_ind_opt.h 0000664 0000000 0000000 00000005307 14634250137 0024227 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Recasts the macrokernel parameters of a 1m-packed problem so that it can be
   run in the real domain. Nothing is modified unless the scalar attached to
   c (beta) has a zero imaginary component AND c is row- or column-stored.
   When the optimization applies, *dt_exec and *dt_c are projected to their
   real counterparts and the dimensions/strides are rescaled in place:
     - always:                k, ps_a and ps_b are doubled;
     - schema_a is 1e-packed: m, pd_a and cs_c are doubled;
     - otherwise (1r):        n, pd_b and rs_c are doubled. */
BLIS_INLINE void bli_gemm_ind_recast_1m_params
     (
       num_t* dt_exec,
       num_t* dt_c,
       pack_t schema_a,
       obj_t* c,
       dim_t* m,
       dim_t* n,
       dim_t* k,
       inc_t* pd_a, inc_t* ps_a,
       inc_t* pd_b, inc_t* ps_b,
       inc_t* rs_c, inc_t* cs_c
     )
{
	obj_t beta;

	/* Pull beta off of c so that its imaginary component can be examined. */
	bli_obj_scalar_detach( c, &beta );

	/* The optimization requires a real-valued beta ... */
	if ( !bli_obj_imag_is_zero( &beta ) ) return;

	/* ... and a row- or column-stored c. */
	if ( bli_is_gen_stored( *rs_c, *cs_c ) ) return;

	/* Switch both the execution and the storage datatype to the real
	   domain. */
	*dt_exec = bli_dt_proj_to_real( *dt_exec );
	*dt_c    = bli_dt_proj_to_real( *dt_c );

	/* These are doubled regardless of which 1m schema was used. */
	*k    *= 2;
	*ps_a *= 2;
	*ps_b *= 2;

	if ( bli_is_1e_packed( schema_a ) )
	{
		/* 1e: the m dimension, the panel dimension of A, and the column
		   stride of c grow by a factor of two. */
		*m    *= 2;
		*pd_a *= 2;
		*cs_c *= 2;
	}
	else /* if ( bli_is_1r_packed( schema_a ) ) */
	{
		/* 1r: the n dimension, the panel dimension of B, and the row
		   stride of c grow by a factor of two. */
		*n    *= 2;
		*pd_b *= 2;
		*rs_c *= 2;
	}
}
cython-blis-1.0.0/blis/_src/frame/3/gemm/other/ 0000775 0000000 0000000 00000000000 14634250137 0021131 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2.c 0000664 0000000 0000000 00000025101 14634250137 0024642 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (spelled gemm_fp after preprocessing) is the function-pointer
// type of the datatype-specific macrokernels generated by the GENTFUNC
// template in this file: pack schemas, the m/n/k dimensions, alpha, the
// packed A and B buffers with their strides and panel dimension/stride,
// beta, the C buffer with its strides, and the context/runtime/thread
// handles.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; bli_gemm_ker_var2() indexes it by
// the execution datatype. (GENARRAY is defined elsewhere; presumably it
// fills in one entry per datatype for the gemm_ker_var2 family.)
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2);
// Object-based front-end of the gemm macrokernel. It unpacks everything the
// typed macrokernel needs from the a, b and c objects, merges the scalars
// attached to a and b into alpha, takes beta from c, and then dispatches to
// the macrokernel selected by the execution datatype of c.
// The cntl argument is accepted but not used in this function.
void bli_gemm_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of c; this picks the typed macrokernel further down.
	num_t  dt_exec  = bli_obj_exec_dt( c );

	// Pack schemas of the two packed operands.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: c is m x n and a is m x k.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer, strides and panel information of a.
	void*  a_buf    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, strides and panel information of b.
	void*  b_buf    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides of c.
	void*  c_buf    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to a and b and multiply them together; the
	// product is left in alpha_ab.
	obj_t  alpha_a;
	obj_t  alpha_ab;
	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// Addresses of the internal scalar buffers: alpha is the merged scalar
	// from above, beta is the scalar attached to c.
	void*  alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// When 1m is used on a row- or column-stored matrix with a real-valued
	// beta, the real domain macro-kernel can be used instead, which avoids a
	// little of the overhead of the 1m virtual micro-kernel.
	// NOTE(review): the locals are handed over by name rather than by
	// address, so bli_l3_ind_recast_1m_params is presumably a macro that
	// updates them in place; its definition is not visible from this file.
#if 1
	if ( bli_is_1m_packed( schema_a ) )
	{
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}
#endif

	// Look up the macrokernel for the (possibly recast) execution datatype
	// and run it.
	FUNCPTR_T macro_kernel = ftypes[dt_exec];

	macro_kernel
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, is_a,
	  pd_a, ps_a,
	  b_buf, rs_b, is_b,
	  pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
// Template for the datatype-specific gemm macrokernel. The generated function
// treats C as a grid of MR x NR micro-tiles (MR = pd_a, NR = pd_b) and walks
// it with two nested loops: the outer loop over the n dimension starts at
// this thread's work id and advances by the thread's n-way count, and the
// inner loop over the m dimension does the same using the sub-node
// ("caucus") of the thread info. For every micro-tile it records the
// addresses of the next micro-panels of A and B in an auxinfo_t, then calls
// the virtual gemm micro-kernel obtained from the context:
//   - full MR x NR tiles are updated directly in C using beta;
//   - edge tiles are computed into the local buffer ct with a zero beta and
//     then merged into C with xpbys_mxn using beta.
// The function returns immediately if any of m, n, k is zero.
// NOTE(review): the micro-kernel is invoked here with ten arguments starting
// at k (no m/n arguments), whereas the rgemm_ukr calls in
// bli_gemm_md_c2r_ref.c pass m and n first through the same gemm_ukr_ft
// pointer type. This file lives under "other/" and may be stale relative to
// the current micro-kernel interface -- confirm before building it.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the output preference of the
virtual gemm micro-kernel (col_pref below): ct is column-stored
when the micro-kernel prefers columns and row-stored otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemm_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2rr.c 0000664 0000000 0000000 00000026226 14634250137 0025217 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (spelled gemm_fp after preprocessing) is the function-pointer
// type of the datatype-specific round-robin macrokernels generated by the
// GENTFUNC template in this file: pack schemas, the m/n/k dimensions, alpha,
// the packed A and B buffers with their strides and panel dimension/stride,
// beta, the C buffer with its strides, and the context/runtime/thread
// handles.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; bli_gemm_ker_var2rr() indexes it by
// the execution datatype. (GENARRAY is defined elsewhere; presumably it
// fills in one entry per datatype for the gemm_ker_var2rr family.)
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
// Object-based front-end of the round-robin gemm macrokernel. It unpacks
// everything the typed macrokernel needs from the a, b and c objects, merges
// the scalars attached to a and b into alpha, takes beta from c, and then
// dispatches to the macrokernel selected by the execution datatype of c.
// The cntl argument is accepted but not used in this function.
void bli_gemm_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of c; this picks the typed macrokernel further down.
	num_t  dt_exec  = bli_obj_exec_dt( c );

	// Pack schemas of the two packed operands.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: c is m x n and a is m x k.
	dim_t  m        = bli_obj_length( c );
	dim_t  n        = bli_obj_width( c );
	dim_t  k        = bli_obj_width( a );

	// Buffer, strides and panel information of a.
	void*  a_buf    = bli_obj_buffer_at_off( a );
	inc_t  cs_a     = bli_obj_col_stride( a );
	inc_t  is_a     = bli_obj_imag_stride( a );
	dim_t  pd_a     = bli_obj_panel_dim( a );
	inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer, strides and panel information of b.
	void*  b_buf    = bli_obj_buffer_at_off( b );
	inc_t  rs_b     = bli_obj_row_stride( b );
	inc_t  is_b     = bli_obj_imag_stride( b );
	dim_t  pd_b     = bli_obj_panel_dim( b );
	inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides of c.
	void*  c_buf    = bli_obj_buffer_at_off( c );
	inc_t  rs_c     = bli_obj_row_stride( c );
	inc_t  cs_c     = bli_obj_col_stride( c );

	// Detach the scalars attached to a and b and multiply them together; the
	// product is left in alpha_ab.
	obj_t  alpha_a;
	obj_t  alpha_ab;
	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// Addresses of the internal scalar buffers: alpha is the merged scalar
	// from above, beta is the scalar attached to c.
	void*  alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*  beta_buf  = bli_obj_internal_scalar_buffer( c );

	// When 1m is used on a row- or column-stored matrix with a real-valued
	// beta, the real domain macro-kernel can be used instead, which avoids a
	// little of the overhead of the 1m virtual micro-kernel.
	// NOTE(review): the locals are handed over by name rather than by
	// address, so bli_l3_ind_recast_1m_params is presumably a macro that
	// updates them in place; its definition is not visible from this file.
	if ( bli_is_1m_packed( schema_a ) )
	{
		bli_l3_ind_recast_1m_params
		(
		  dt_exec,
		  schema_a,
		  c,
		  m, n, k,
		  pd_a, ps_a,
		  pd_b, ps_b,
		  rs_c, cs_c
		);
	}

	// Look up the macrokernel for the (possibly recast) execution datatype
	// and run it.
	FUNCPTR_T macro_kernel = ftypes[dt_exec];

	macro_kernel
	(
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  alpha_buf,
	  a_buf, cs_a, is_a,
	  pd_a, ps_a,
	  b_buf, rs_b, is_b,
	  pd_b, ps_b,
	  beta_buf,
	  c_buf, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the var2rr gemm macrokernel. GENTFUNC is a template
// taking a C type (ctype), a type character (ch) and the variant name; it is
// expanded by INSERT_GENTFUNC_BASIC0 at the bottom of this block.
//
// The generated function walks C in NR-wide column panels (outer "jr" loop)
// and MR-tall row panels (inner "ir" loop), where MR = pd_a and NR = pd_b.
// The per-thread loop bounds and increments come from
// bli_thread_range_jrir_rr(). Full MR x NR tiles are updated in place by the
// gemm micro-kernel; partial tiles are first computed into the stack buffer
// ct and then merged into C with xpbys_mxn.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides follow the storage
preference that the context reports for the gemm micro-kernel (col_pref
below), not the storage of the original C matrix: if the micro-kernel
prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR); otherwise
it is row-stored (rs_ct = NR, cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial trailing panel still needs an iteration of its own. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
/* On the last ir iteration the next panel of A wraps back to the
start of A and the next panel of B advances; on the last jr
iteration as well, the next panel of B wraps back to the start
of B. */ \
if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. The result goes to the temporary
buffer ct, with zero passed in place of beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
// Instantiate the template above for gemm_ker_var2rr.
// NOTE(review): presumably one instance per basic datatype -- confirm against
// the definition of INSERT_GENTFUNC_BASIC0.
INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2sl.c 0000664 0000000 0000000 00000026220 14634250137 0025204 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T expands to gemm_fp, the name under which the function-pointer
// type below is declared.
#define FUNCPTR_T gemm_fp
// Signature shared by the datatype-specific macrokernels that the GENTFUNC
// template later in this file generates.
typedef void (*FUNCPTR_T)
(
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers; bli_gemm_ker_var2sl() indexes it
// with the execution datatype (ftypes[dt_exec]).
// NOTE(review): GENARRAY presumably fills in one gemm_ker_var2sl instance per
// datatype -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
//
// Object-based entry point of the var2sl gemm macrokernel.
//
// Extracts the buffers, strides, panel geometry and pack schemas of the
// operands a and b and of the output c, folds the scalars attached to a and
// b into a single alpha, and then dispatches on the execution datatype of c
// through the ftypes table.
//
// cntx, rntm and thread are forwarded untouched to the typed macrokernel.
// cntl is part of the common variant signature but is not consulted here.
//
void bli_gemm_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Execution datatype and problem shape: the length and width of c give
    // m and n, and the width of a gives k.
    num_t  dt_exec  = bli_obj_exec_dt( c );
    dim_t  m        = bli_obj_length( c );
    dim_t  n        = bli_obj_width( c );
    dim_t  k        = bli_obj_width( a );

    // Pack schema, buffer, strides and panel geometry of A.
    pack_t schema_a = bli_obj_pack_schema( a );
    void*  buf_a    = bli_obj_buffer_at_off( a );
    inc_t  cs_a     = bli_obj_col_stride( a );
    inc_t  is_a     = bli_obj_imag_stride( a );
    dim_t  pd_a     = bli_obj_panel_dim( a );
    inc_t  ps_a     = bli_obj_panel_stride( a );

    // Pack schema, buffer, strides and panel geometry of B.
    pack_t schema_b = bli_obj_pack_schema( b );
    void*  buf_b    = bli_obj_buffer_at_off( b );
    inc_t  rs_b     = bli_obj_row_stride( b );
    inc_t  is_b     = bli_obj_imag_stride( b );
    dim_t  pd_b     = bli_obj_panel_dim( b );
    inc_t  ps_b     = bli_obj_panel_stride( b );

    // Buffer and strides of the output matrix C.
    void*  buf_c    = bli_obj_buffer_at_off( c );
    inc_t  rs_c     = bli_obj_row_stride( c );
    inc_t  cs_c     = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and multiply them together.
    // The merged value is read back from the second operand of bli_mulsc().
    obj_t  alpha_a;
    obj_t  alpha_ab;

    bli_obj_scalar_detach( a, &alpha_a );
    bli_obj_scalar_detach( b, &alpha_ab );
    bli_mulsc( &alpha_a, &alpha_ab );

    // alpha is the merged scalar; beta is the scalar attached to C.
    void*  buf_alpha = bli_obj_internal_scalar_buffer( &alpha_ab );
    void*  buf_beta  = bli_obj_internal_scalar_buffer( c );

    // When A is packed for 1m, let the induced-method helper recast the
    // parameters: with a real-valued beta and a row- or column-stored C the
    // real-domain macrokernel can be used, which avoids some overhead of
    // the 1m virtual micro-kernel.
    if ( bli_is_1m_packed( schema_a ) )
    {
        bli_l3_ind_recast_1m_params( dt_exec,
                                     schema_a,
                                     c,
                                     m, n, k,
                                     pd_a, ps_a,
                                     pd_b, ps_b,
                                     rs_c, cs_c );
    }

    // Look up the typed macrokernel for dt_exec and run it.
    ftypes[ dt_exec ]( schema_a,
                       schema_b,
                       m,
                       n,
                       k,
                       buf_alpha,
                       buf_a, cs_a, is_a,
                       pd_a, ps_a,
                       buf_b, rs_b, is_b,
                       pd_b, ps_b,
                       buf_beta,
                       buf_c, rs_c, cs_c,
                       cntx,
                       rntm,
                       thread );
}
// Type-generic body of the var2sl gemm macrokernel. GENTFUNC is a template
// taking a C type (ctype), a type character (ch) and the variant name; it is
// expanded by INSERT_GENTFUNC_BASIC0 at the bottom of this block.
//
// The generated function walks C in NR-wide column panels (outer "jr" loop)
// and MR-tall row panels (inner "ir" loop), where MR = pd_a and NR = pd_b.
// The per-thread loop bounds and increments come from
// bli_thread_range_jrir_sl(). Full MR x NR tiles are updated in place by the
// gemm micro-kernel; partial tiles are first computed into the stack buffer
// ct and then merged into C with xpbys_mxn.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides follow the storage
preference that the context reports for the gemm micro-kernel (col_pref
below), not the storage of the original C matrix: if the micro-kernel
prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR); otherwise
it is row-stored (rs_ct = NR, cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial trailing panel still needs an iteration of its own. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Determine the thread range and increment for each thrinfo_t node. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
/* On the last ir iteration the next panel of A wraps back to the
start of A and the next panel of B advances; on the last jr
iteration as well, the next panel of B wraps back to the start
of B. */ \
if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. The result goes to the temporary
buffer ct, with zero passed in place of beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
// Instantiate the template above for gemm_ker_var2sl.
// NOTE(review): presumably one instance per basic datatype -- confirm against
// the definition of INSERT_GENTFUNC_BASIC0.
INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/gemm/other/bli_gemm_ker_var5.c 0000664 0000000 0000000 00000023736 14634250137 0024661 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T expands to gemm_fp, the name under which the function-pointer
// type below is declared.
#define FUNCPTR_T gemm_fp
// Signature shared by the datatype-specific var5 macrokernels generated by
// the GENTFUNC template later in this file. The gemm micro-kernel is passed
// in as an untyped function pointer (gemm_ukr).
typedef void (*FUNCPTR_T)(
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
void_fp gemm_ukr
);
// Table of macrokernel function pointers; bli_gemm_ker_var5() indexes it with
// the execution datatype (ftypes[dt_exec]).
// NOTE(review): GENARRAY presumably fills in one gemm_ker_var5 instance per
// datatype -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);
//
// Object-based entry point of the var5 gemm macrokernel.
//
// Extracts the buffers, strides and panel geometry of a, b and c, folds the
// scalars attached to a and b into a single alpha, queries the context for
// the gemm micro-kernel that matches the execution datatype of c, and then
// dispatches through the ftypes table.
//
// cntl and thread are part of the signature but are not consulted here.
//
void bli_gemm_ker_var5( obj_t*          a,
                        obj_t*          b,
                        obj_t*          c,
                        cntx_t*         cntx,
                        gemm_t*         cntl,
                        gemm_thrinfo_t* thread )
{
    // Execution datatype and problem shape: the length and width of c give
    // m and n, and the width of a gives k.
    num_t  dt_exec = bli_obj_exec_dt( c );
    dim_t  m       = bli_obj_length( c );
    dim_t  n       = bli_obj_width( c );
    dim_t  k       = bli_obj_width( a );

    // Buffer, column stride and panel geometry of A.
    void*  buf_a   = bli_obj_buffer_at_off( a );
    inc_t  cs_a    = bli_obj_col_stride( a );
    dim_t  pd_a    = bli_obj_panel_dim( a );
    inc_t  ps_a    = bli_obj_panel_stride( a );

    // Buffer, row stride and panel geometry of B.
    void*  buf_b   = bli_obj_buffer_at_off( b );
    inc_t  rs_b    = bli_obj_row_stride( b );
    dim_t  pd_b    = bli_obj_panel_dim( b );
    inc_t  ps_b    = bli_obj_panel_stride( b );

    // Buffer and strides of the output matrix C.
    void*  buf_c   = bli_obj_buffer_at_off( c );
    inc_t  rs_c    = bli_obj_row_stride( c );
    inc_t  cs_c    = bli_obj_col_stride( c );

    // Detach the scalars attached to A and B and multiply them together.
    // The merged value is read back from the second operand of bli_mulsc().
    obj_t  alpha_a;
    obj_t  alpha_ab;

    bli_obj_scalar_detach( a, &alpha_a );
    bli_obj_scalar_detach( b, &alpha_ab );
    bli_mulsc( &alpha_a, &alpha_ab );

    // alpha is the merged scalar; beta is the scalar attached to C.
    void*  buf_alpha = bli_obj_internal_scalar_buffer( &alpha_ab );
    void*  buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Fetch the func_t holding the gemm micro-kernel addresses from the
    // context and pick the entry for the execution datatype.
    func_t* ukr_table = bli_cntx_get_l3_ukr( BLIS_GEMM_UKR, cntx );
    void_fp gemm_ukr  = bli_func_get_dt( dt_exec, ukr_table );

    // Look up the typed macrokernel for dt_exec and run it.
    ftypes[ dt_exec ]( m,
                       n,
                       k,
                       buf_alpha,
                       buf_a, cs_a, pd_a, ps_a,
                       buf_b, rs_b, pd_b, ps_b,
                       buf_beta,
                       buf_c, rs_c, cs_c,
                       gemm_ukr );
}
// Type-generic body of the var5 gemm macrokernel. GENTFUNC is a template
// taking a C type (ctype), a type character (ch), the variant name and the
// name of the micro-kernel function-pointer type (ukrtype); it is expanded by
// INSERT_GENTFUNC_BASIC at the bottom of this block.
//
// The generated function takes no thread information: both loops cover the
// full ranges [0, n_iter) and [0, m_iter). For every NR-wide column panel, a
// micro-panel of B is packed into the stack buffer bp with packm_cxk, and the
// micro-kernel then reads B from bp. Full MR x NR tiles are updated in place;
// partial tiles are computed into ct and merged into C with xpbys_mxn.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void_fp gemm_ukr \
) \
{ \
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
\
/* Temporary buffer for incremental packing of B. */ \
ctype bp[ PASTEMAC(ch,maxkc) * \
/* !!!! NOTE: This packnr actually needs to be something like maxpacknr
if it is to be guaranteed to work in all situations !!!! The right
place to define maxpackmr/nr would be in bli_kernel_post_macro_defs.h */ \
PASTEMAC(ch,packnr) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
\
/* Temporary C buffer for edge cases. It is always addressed as
column-stored here: unit row stride, column stride of maxmr. */ \
ctype ct[ PASTEMAC(ch,maxmr) * \
PASTEMAC(ch,maxnr) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const inc_t rs_ct = 1; \
const inc_t cs_ct = PASTEMAC(ch,maxmr); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKNR = rs_b; \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
ctype* restrict b2; \
\
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t i, j; \
dim_t m_cur; \
dim_t n_cur; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial trailing panel still needs an iteration of its own. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the panel strides of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_ps_a( ps_a, &aux ); \
bli_auxinfo_set_ps_b( ps_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Since we pack micro-panels of B incrementally, one at a time, the
address of the next micro-panel of B remains constant. */ \
b2 = bp; \
\
/* Save address of next panel of B to the auxinfo_t object. */ \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Incrementally pack a single micro-panel of B. */ \
PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
n_cur, \
k, \
one, \
b1, 1, rs_b, \
bp, PACKNR ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
/* On the last iteration, the next panel of A wraps back to the
start of A. */ \
if ( bli_is_last_iter( i, m_iter ) ) \
{ \
a2 = a_cast; \
} \
\
/* Save address of next panel of A to the auxinfo_t object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
bp, \
beta_cast, \
c11, rs_c, cs_c, \
&aux ); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. The result goes to the temporary
buffer ct, with zero passed in place of beta. */ \
gemm_ukr_cast( k, \
alpha_cast, \
a1, \
bp, \
zero, \
ct, rs_ct, cs_ct, \
&aux ); \
\
/* Scale the bottom edge of C and add the result from above. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}
// Instantiate the template above for gemm_ker_var5, using gemm_ukr_t as the
// micro-kernel function-pointer type.
// NOTE(review): presumably one instance per basic datatype -- confirm against
// the definition of INSERT_GENTFUNC_BASIC.
INSERT_GENTFUNC_BASIC( gemm_ker_var5, gemm_ukr_t )
cython-blis-1.0.0/blis/_src/frame/3/gemm/other/bli_gemm_ker_var5.h 0000664 0000000 0000000 00000005073 14634250137 0024660 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// Object-based var5 gemm macrokernel: takes the operands a and b, the output
// c, a context, a control-tree node and thread information.
void bli_gemm_ker_var5( obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
gemm_t* cntl,
gemm_thrinfo_t* thread );
//
// Prototype BLAS-like interfaces.
//
// Prototype template for the typed var5 macrokernels. Each expansion declares
// one function named by PASTEMAC(ch,varname) that takes the dimensions, raw
// buffers, strides and panel geometry, plus the gemm micro-kernel as an
// untyped function pointer.
// NOTE(review): INSERT_GENTPROT_BASIC presumably expands this once per basic
// datatype -- confirm against its definition.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
void_fp gemm_ukr \
);
INSERT_GENTPROT_BASIC( gemm_ker_var5 )
cython-blis-1.0.0/blis/_src/frame/3/gemmt/ 0000775 0000000 0000000 00000000000 14634250137 0020174 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt.h 0000664 0000000 0000000 00000003276 14634250137 0022314 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_gemmt_front.h"
#include "bli_gemmt_var.h"
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_front.c 0000664 0000000 0000000 00000010115 14634250137 0023505 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Front end of the gemmt operation.
//
// Handles the trivial cases (empty C; alpha equal to zero or an empty A or
// B, where C is only scaled by beta), then prepares local aliases of the
// operands, optionally transposes the whole operation to suit the
// micro-kernel's storage preference, sets the pack schemas and the ways of
// parallelism, and finally hands the work to the level-3 thread decorator.
//
// The caller's a, b and c objects are never modified; all transformations
// are applied to local aliases.
//
void bli_gemmt_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    bli_init_once();

    // Local aliases of the operands, set up once the early exits are passed.
    obj_t a_view;
    obj_t b_view;
    obj_t c_view;

    // An empty C leaves nothing to update.
    if ( bli_obj_has_zero_dim( c ) ) return;

    // If alpha is zero, or if A or B has a zero dimension, the operation
    // reduces to scaling C by beta.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) ||
         bli_obj_has_zero_dim( a ) ||
         bli_obj_has_zero_dim( b ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Alias each operand in case transformations need to be applied, and
    // reset the alias's origin: the buffer field is set to the location
    // implied by the row and column offsets, which are then zeroed. An
    // object that was a view into a larger matrix thereby "forgets" its
    // lineage.
    bli_obj_alias_to( a, &a_view );
    bli_obj_reset_origin( &a_view );

    bli_obj_alias_to( b, &b_view );
    bli_obj_reset_origin( &b_view );

    bli_obj_alias_to( c, &c_view );
    bli_obj_reset_origin( &c_view );

    // Optimization: if the micro-kernel dislikes the storage of C (C stored
    // by rows while it prefers contiguous columns, or the reverse),
    // transpose the entire operation so that the micro-kernel can access C
    // in its preferred manner.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_view, BLIS_GEMM_UKR, cntx ) )
    {
        bli_obj_swap( &a_view, &b_view );

        bli_obj_induce_trans( &a_view );
        bli_obj_induce_trans( &b_view );
        bli_obj_induce_trans( &c_view );
    }

    // Set the pack schemas within the objects, as appropriate.
    bli_l3_set_schemas( &a_view, &b_view, &c_view, cntx );

    // Parse and interpret the rntm_t object to set the ways of parallelism
    // for each loop, and make any further adjustments the operation needs.
    bli_rntm_set_ways_for_op( BLIS_GEMM,
                              BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt
                              bli_obj_length( &c_view ),
                              bli_obj_width( &c_view ),
                              bli_obj_width( &a_view ),
                              rntm );

    // Invoke the internal back-end via the thread handler.
    bli_l3_thread_decorator( bli_l3_int,
                             BLIS_GEMMT, // operation family id
                             alpha,
                             &a_view,
                             &b_view,
                             beta,
                             &c_view,
                             cntx,
                             rntm,
                             cntl );
}
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_front.h 0000664 0000000 0000000 00000003600 14634250137 0023513 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype for the object-based gemmt front-end.
//
// Parameters (as declared here; semantics of each operand are defined by the
// implementation in the corresponding .c file):
//   alpha, beta - scalar objects (presumably the scalars applied to the A*B
//                 product and to C, respectively -- TODO confirm against the
//                 definition).
//   a, b        - input matrix objects.
//   c           - matrix object that receives the result.
//   cntx        - context used to query kernels and blocksizes.
//   rntm        - runtime object carrying threading information.
//   cntl        - control tree handed through to the back-end.
void bli_gemmt_front
(
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_l_ker_var2.c 0000664 0000000 0000000 00000037610 14634250137 0024414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a local alias for the function pointer type name so that the
// typedef and the dispatch table below share a single spelling.
#define FUNCPTR_T gemmt_fp
// Signature of the datatype-specific lower-triangular gemmt macro-kernels
// that are generated by the GENTFUNC macro further down in this file. The
// argument list mirrors, one for one, the values that bli_gemmt_l_ker_var2()
// extracts from its obj_t operands:
//   diagoffc         - diagonal offset of the panel of C.
//   schema_a/b       - pack schemas of A and B.
//   m, n, k          - dimensions of the panel of C (m x n) and the inner
//                      dimension (k).
//   alpha, beta      - addresses of the scalar values.
//   a, cs_a, is_a,
//   pd_a, ps_a       - buffer, column stride, imaginary stride, panel
//                      dimension and panel stride of A.
//   b, rs_b, is_b,
//   pd_b, ps_b       - buffer, row stride, imaginary stride, panel dimension
//                      and panel stride of B.
//   c, rs_c, cs_c    - buffer and row/column strides of C.
//   cntx, rntm       - context and runtime objects.
//   thread           - thrinfo_t node for the calling thread.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Dispatch table of typed kernels. bli_gemmt_l_ker_var2() indexes it with the
// execution datatype of C. (GENARRAY is defined elsewhere; presumably it
// expands to an initializer with one entry per supported datatype.)
static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
void bli_gemmt_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Object-based entry point for the lower-triangular gemmt macro-kernel.
    // It unpacks everything the typed kernel needs from the obj_t operands,
    // merges the scalars attached to A and B into a single alpha, and then
    // dispatches on the execution datatype of C.
    // NOTE: cntl is part of the common variant signature but is not
    // referenced here.

    // Properties of the output panel C.
    const num_t  exec_dt   = bli_obj_exec_dt( c );
    const doff_t c_diagoff = bli_obj_diag_offset( c );
    const dim_t  c_rows    = bli_obj_length( c );
    const dim_t  c_cols    = bli_obj_width( c );
    void* const  c_buf     = bli_obj_buffer_at_off( c );
    const inc_t  c_rs      = bli_obj_row_stride( c );
    const inc_t  c_cs      = bli_obj_col_stride( c );

    // Properties of the left-hand operand A. Its width is the inner (k)
    // dimension of the product.
    const pack_t a_schema  = bli_obj_pack_schema( a );
    const dim_t  inner_dim = bli_obj_width( a );
    void* const  a_buf     = bli_obj_buffer_at_off( a );
    const inc_t  a_cs      = bli_obj_col_stride( a );
    const inc_t  a_is      = bli_obj_imag_stride( a );
    const dim_t  a_pd      = bli_obj_panel_dim( a );
    const inc_t  a_ps      = bli_obj_panel_stride( a );

    // Properties of the right-hand operand B.
    const pack_t b_schema  = bli_obj_pack_schema( b );
    void* const  b_buf     = bli_obj_buffer_at_off( b );
    const inc_t  b_rs      = bli_obj_row_stride( b );
    const inc_t  b_is      = bli_obj_imag_stride( b );
    const dim_t  b_pd      = bli_obj_panel_dim( b );
    const inc_t  b_ps      = bli_obj_panel_stride( b );

    // Pull the scalars off of A and B and multiply them together. The
    // merged value is read back from the second object below, and serves
    // as alpha for the typed kernel.
    obj_t alpha_a;
    obj_t alpha_ab;

    bli_obj_scalar_detach( a, &alpha_a );
    bli_obj_scalar_detach( b, &alpha_ab );
    bli_mulsc( &alpha_a, &alpha_ab );

    // alpha comes from the merged scalar; beta is the scalar attached to C.
    void* const alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
    void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the kernel for the execution datatype and call it.
    ftypes[ exec_dt ]
    (
      c_diagoff,
      a_schema,
      b_schema,
      c_rows,
      c_cols,
      inner_dim,
      alpha_buf,
      a_buf, a_cs, a_is,
             a_pd, a_ps,
      b_buf, b_rs, b_is,
             b_pd, b_ps,
      beta_buf,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed macro-kernel for gemmt where only the lower triangle of C is
   stored/updated.

   Arguments:
     diagoffc       diagonal offset of the m x n panel of C.
     schema_a/b     pack schemas of A and B (forwarded via auxinfo_t).
     m, n, k        panel dimensions of C and the inner dimension.
     alpha, beta    addresses of the scalars (cast to ctype).
     a, b           packed micropanels of A and B; pd_a is used as MR, pd_b
                    as NR, ps_a/ps_b as the strides between micropanels, and
                    is_a/is_b are forwarded via auxinfo_t. cs_a and rs_b are
                    not referenced.
     c, rs_c, cs_c  the panel of C and its strides.
     cntx           context queried for the gemm micro-kernel and its
                    storage preference.
     rntm           not referenced.
     thread         thrinfo_t node for the 2nd (jr) loop.

   The n dimension is split into a leading rectangular region (columns of
   microtiles that lie entirely below the diagonal, updated directly) and a
   trailing triangular region (microtiles tested individually against the
   diagonal). */ \
void PASTEMAC(ch,varname) \
     ( \
       doff_t     diagoffc, \
       pack_t     schema_a, \
       pack_t     schema_b, \
       dim_t      m, \
       dim_t      n, \
       dim_t      k, \
       void*      alpha, \
       void*      a, inc_t cs_a, inc_t is_a, \
                     dim_t pd_a, inc_t ps_a, \
       void*      b, inc_t rs_b, inc_t is_b, \
                     dim_t pd_b, inc_t ps_b, \
       void*      beta, \
       void*      c, inc_t rs_c, inc_t cs_c, \
       cntx_t*    cntx, \
       rntm_t*    rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    /*const dim_t PACKMR = cs_a;*/ \
    /*const dim_t PACKNR = rs_b;*/ \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
        gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for microtiles that intersect the diagonal. Its
       strides follow the micro-kernel's storage preference: column-stored
       (rs = 1, cs = MR) if the micro-kernel prefers columns, row-stored
       (rs = NR, cs = 1) otherwise. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
        __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool  col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct    = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct    = ( col_pref ? MR : 1 ); \
\
    ctype* restrict zero       = PASTEMAC(ch,0); \
    ctype* restrict a_cast     = a; \
    ctype* restrict b_cast     = b; \
    ctype* restrict c_cast     = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast  = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffc_ij; \
    dim_t  m_iter, m_left; \
    dim_t  n_iter, n_left; \
    dim_t  m_cur; \
    dim_t  n_cur; \
    dim_t  i, j, ip; \
    inc_t  rstep_a; \
    inc_t  cstep_b; \
    inc_t  rstep_c, cstep_c; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If the current panel of C is entirely above the diagonal,
       it is not stored. So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
    /* If there is a zero region above where the diagonal of C intersects
       the left edge of the panel, skip the whole MR-row micropanels that
       lie in it by advancing the pointers to C and A.
       NOTE: Skipping i rows moves the diagonal offset by +i (compare the
       per-tile formula diagoffc - j*NR + i*MR used below). The offset is
       therefore zero afterwards if it was a multiple of MR; otherwise it
       remains negative with a magnitude smaller than MR. */ \
    if ( diagoffc < 0 ) \
    { \
        ip       = -diagoffc / MR; \
        i        = ip * MR; \
        m        = m - i; \
        diagoffc = diagoffc + ( doff_t )i; \
        c_cast   = c_cast + (i  )*rs_c; \
        a_cast   = a_cast + (ip )*ps_a; \
    } \
\
    /* If there is a zero region to the right of where the diagonal
       of C intersects the bottom of the panel, shrink it to prevent
       "no-op" iterations from executing. */ \
    if ( diagoffc + m < n ) \
    { \
        n = diagoffc + m; \
    } \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_a( is_a, &aux ); \
    bli_auxinfo_set_is_b( is_b, &aux ); \
\
    /* Save the desired output datatype (indicating no typecasting). */ \
    /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
    /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
       loop around the microkernel. Here we query the thrinfo_t node for the
       1st (ir) loop around the microkernel. */ \
    thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt  = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
    dim_t ir_nt  = bli_thread_n_way( caucus ); \
    dim_t ir_tid = bli_thread_work_id( caucus ); \
\
    dim_t jr_start, jr_end; \
    dim_t ir_start, ir_end; \
    dim_t jr_inc,   ir_inc; \
\
    /* Note that we partition the 2nd loop into two regions: the rectangular
       part of C, and the triangular portion. */ \
    dim_t n_iter_rct; \
    dim_t n_iter_tri; \
\
    if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \
    { \
        /* If the entire panel of C does not intersect the diagonal, there is
           no triangular region, and therefore we can skip the second set of
           loops. */ \
        n_iter_rct = n_iter; \
        n_iter_tri = 0; \
    } \
    else \
    { \
        /* If the panel of C does intersect the diagonal, compute the number of
           iterations in the rectangular region by dividing NR into the diagonal
           offset. Any remainder from this integer division is discarded, which
           is what we want. That is, we want the rectangular region to contain
           as many columns of whole microtiles as possible without including any
           microtiles that intersect the diagonal. The number of iterations in
           the triangular (or trapezoidal) region is computed as the remaining
           number of iterations in the n dimension.
           NOTE: The diagonal offset may be negative here (see the pruning
           step above), in which case there is no rectangular region; clamp
           to zero so the iteration count can never become negative. */ \
        n_iter_rct = ( diagoffc > 0 ? diagoffc / NR : 0 ); \
        n_iter_tri = n_iter - n_iter_rct; \
    } \
\
    /* Determine the thread range and increment for the 2nd and 1st loops for
       the initial rectangular region of C (if it exists).
       NOTE: The definition of bli_thread_range_jrir() will depend on whether
       slab or round-robin partitioning was requested at configure-time. */ \
    bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
    bli_thread_range_jrir( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        /* Interior loop over the m dimension (MR rows at a time). */ \
        for ( i = ir_start; i < ir_end; i += ir_inc ) \
        { \
            ctype* restrict a2; \
\
            a1  = a_cast + i * rstep_a; \
            c11 = c1     + i * rstep_c; \
\
            /* No need to compute the diagonal offset for the rectangular
               region. */ \
            /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* Compute the addresses of the next panels of A and B. */ \
            a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
            if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
            { \
                a2 = a_cast; \
                b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
                if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
                    b2 = b_cast; \
            } \
\
            /* Save addresses of next panels of A and B to the auxinfo_t
               object. */ \
            bli_auxinfo_set_next_a( a2, &aux ); \
            bli_auxinfo_set_next_b( b2, &aux ); \
\
            /* Every microtile in the rectangular region is treated as fully
               stored, so no diagonal test is made: the micro-kernel updates
               C in place. */ \
            { \
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  beta_cast, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx  \
                ); \
            } \
        } \
    } \
\
    /* If there is no triangular region, then we're done. */ \
    if ( n_iter_tri == 0 ) return; \
\
    /* Use round-robin assignment of micropanels to threads in the 2nd loop
       and the default (slab or rr) partitioning in the 1st loop for the
       remaining triangular region of C. */ \
    bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
    /* Advance the start and end iteration offsets for the triangular region
       by the number of iterations used for the rectangular region. */ \
    jr_start += n_iter_rct; \
    jr_end   += n_iter_rct; \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        /* Interior loop over the m dimension (MR rows at a time). */ \
        for ( i = ir_start; i < ir_end; i += ir_inc ) \
        { \
            ctype* restrict a2; \
\
            a1  = a_cast + i * rstep_a; \
            c11 = c1     + i * rstep_c; \
\
            /* Compute the diagonal offset for the submatrix at (i,j). */ \
            diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* Compute the addresses of the next panels of A and B. */ \
            a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
            if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
            { \
                a2 = a_cast; \
                b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
                if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
                    b2 = b_cast; \
            } \
\
            /* Save addresses of next panels of A and B to the auxinfo_t
               object. */ \
            bli_auxinfo_set_next_a( a2, &aux ); \
            bli_auxinfo_set_next_b( b2, &aux ); \
\
            /* If the diagonal intersects the current MR x NR submatrix, we
               compute it in the temporary buffer and then add in the elements
               on or below the diagonal.
               Otherwise, if the submatrix is strictly below the diagonal,
               we compute and store as we normally would.
               And if we're strictly above the diagonal, we do nothing and
               continue. */ \
            if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
            { \
                /* Invoke the gemm micro-kernel on a full MR x NR tile with
                   beta == zero so that ct is simply overwritten. */ \
                gemm_ukr \
                ( \
                  MR, \
                  NR, \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  zero, \
                  ct, rs_ct, cs_ct, \
                  &aux, \
                  cntx  \
                ); \
\
                /* Scale C and add the result to only the stored part. */ \
                PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
                                          m_cur, n_cur, \
                                          ct,  rs_ct, cs_ct, \
                                          beta_cast, \
                                          c11, rs_c,  cs_c ); \
            } \
            else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
            { \
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  beta_cast, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx  \
                ); \
            } \
        } \
    } \
}
INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_u_ker_var2.c 0000664 0000000 0000000 00000040164 14634250137 0024423 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a local alias for the function pointer type name so that the
// typedef and the dispatch table below share a single spelling.
#define FUNCPTR_T gemmt_fp
// Signature of the datatype-specific upper-triangular gemmt macro-kernels
// that are generated by the GENTFUNC macro further down in this file. The
// argument list mirrors, one for one, the values that bli_gemmt_u_ker_var2()
// extracts from its obj_t operands:
//   diagoffc         - diagonal offset of the panel of C.
//   schema_a/b       - pack schemas of A and B.
//   m, n, k          - dimensions of the panel of C (m x n) and the inner
//                      dimension (k).
//   alpha, beta      - addresses of the scalar values.
//   a, cs_a, is_a,
//   pd_a, ps_a       - buffer, column stride, imaginary stride, panel
//                      dimension and panel stride of A.
//   b, rs_b, is_b,
//   pd_b, ps_b       - buffer, row stride, imaginary stride, panel dimension
//                      and panel stride of B.
//   c, rs_c, cs_c    - buffer and row/column strides of C.
//   cntx, rntm       - context and runtime objects.
//   thread           - thrinfo_t node for the calling thread.
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Dispatch table of typed kernels. bli_gemmt_u_ker_var2() indexes it with the
// execution datatype of C. (GENARRAY is defined elsewhere; presumably it
// expands to an initializer with one entry per supported datatype.)
static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
void bli_gemmt_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Object-based entry point for the upper-triangular gemmt macro-kernel.
    // It unpacks everything the typed kernel needs from the obj_t operands,
    // merges the scalars attached to A and B into a single alpha, and then
    // dispatches on the execution datatype of C.
    // NOTE: cntl is part of the common variant signature but is not
    // referenced here.

    // Properties of the output panel C.
    const num_t  exec_dt   = bli_obj_exec_dt( c );
    const doff_t c_diagoff = bli_obj_diag_offset( c );
    const dim_t  c_rows    = bli_obj_length( c );
    const dim_t  c_cols    = bli_obj_width( c );
    void* const  c_buf     = bli_obj_buffer_at_off( c );
    const inc_t  c_rs      = bli_obj_row_stride( c );
    const inc_t  c_cs      = bli_obj_col_stride( c );

    // Properties of the left-hand operand A. Its width is the inner (k)
    // dimension of the product.
    const pack_t a_schema  = bli_obj_pack_schema( a );
    const dim_t  inner_dim = bli_obj_width( a );
    void* const  a_buf     = bli_obj_buffer_at_off( a );
    const inc_t  a_cs      = bli_obj_col_stride( a );
    const inc_t  a_is      = bli_obj_imag_stride( a );
    const dim_t  a_pd      = bli_obj_panel_dim( a );
    const inc_t  a_ps      = bli_obj_panel_stride( a );

    // Properties of the right-hand operand B.
    const pack_t b_schema  = bli_obj_pack_schema( b );
    void* const  b_buf     = bli_obj_buffer_at_off( b );
    const inc_t  b_rs      = bli_obj_row_stride( b );
    const inc_t  b_is      = bli_obj_imag_stride( b );
    const dim_t  b_pd      = bli_obj_panel_dim( b );
    const inc_t  b_ps      = bli_obj_panel_stride( b );

    // Pull the scalars off of A and B and multiply them together. The
    // merged value is read back from the second object below, and serves
    // as alpha for the typed kernel.
    obj_t alpha_a;
    obj_t alpha_ab;

    bli_obj_scalar_detach( a, &alpha_a );
    bli_obj_scalar_detach( b, &alpha_ab );
    bli_mulsc( &alpha_a, &alpha_ab );

    // alpha comes from the merged scalar; beta is the scalar attached to C.
    void* const alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
    void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Select the kernel for the execution datatype and call it.
    ftypes[ exec_dt ]
    (
      c_diagoff,
      a_schema,
      b_schema,
      c_rows,
      c_cols,
      inner_dim,
      alpha_buf,
      a_buf, a_cs, a_is,
             a_pd, a_ps,
      b_buf, b_rs, b_is,
             b_pd, b_ps,
      beta_buf,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
/* Typed macro-kernel for gemmt where only the upper triangle of C is
stored/updated.
Arguments:
diagoffc - diagonal offset of the m x n panel of C.
schema_a/b - pack schemas of A and B (forwarded via auxinfo_t).
m, n, k - panel dimensions of C and the inner dimension.
alpha, beta - addresses of the scalars (cast to ctype).
a, b - packed micropanels of A and B; pd_a is used as MR, pd_b as NR,
ps_a/ps_b as the strides between micropanels, and is_a/is_b are
forwarded via auxinfo_t. cs_a and rs_b are not referenced.
c, rs_c, cs_c - the panel of C and its strides.
cntx - context queried for the gemm micro-kernel and its storage
preference.
rntm - not referenced.
thread - thrinfo_t node for the 2nd (jr) loop.
The n dimension is split into a leading triangular region (microtiles
tested individually against the diagonal) and a trailing rectangular
region (microtiles updated directly, without a diagonal test). */ \
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for microtiles that intersect the diagonal. Its
strides follow the micro-kernel's storage preference: column-stored
(rs = 1, cs = MR) if the micro-kernel prefers columns, row-stored
(rs = NR, cs = 1) otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, skip the whole NR-column
micropanels that lie in it by advancing the pointers to C and B.
NOTE: It's possible that after this pruning the diagonal offset
is still positive (though it is guaranteed to be less than NR). */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
/* Save the desired output datatype (indicating no typecasting). */ \
/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
{ \
/* If the entire panel of C does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of C does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the sum of the number of rows in C and the diagonal offset. A
non-zero remainder means we need to add one additional iteration.
That is, we want the triangular region to contain as few columns of
whole microtiles as possible while still including all microtiles
that intersect the diagonal. The number of iterations in the
rectangular region is computed as the remaining number of iterations
in the n dimension. */ \
n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
and the default (slab or rr) partitioning in the 1st loop for the
initial triangular region of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel on a full MR x NR tile with
beta == zero so that ct is simply overwritten. */ \
gemm_ukr \
( \
MR, \
NR, \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd loop of the
remaining rectangular region of C (and also use default partitioning
for the 1st loop).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* No need to compute the diagonal offset for the rectangular
region. */ \
/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Every microtile in the rectangular region is treated as fully
stored, so no diagonal test is made: the micro-kernel updates
C in place. */ \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_var.h 0000664 0000000 0000000 00000005450 14634250137 0023160 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one void function whose name is produced by
// PASTEMAC0(opname) and whose parameters are the three operand objects
// (a, ah, c), the context, the runtime object, the control-tree node and
// the thread info structure. It is instantiated below for the generic
// dispatcher (gemmt_x_ker_var2) and for the lower- and upper-stored
// macrokernel wrappers (gemmt_l_ker_var2, gemmt_u_ker_var2).
// NOTE(review): PASTEMAC0 presumably prepends the "bli_" prefix, matching
// the bli_gemmt_*_ker_var2 definitions in the .c files -- confirm against
// the macro's definition.
//
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* ah, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
GENPROT( gemmt_x_ker_var2 )
GENPROT( gemmt_l_ker_var2 )
GENPROT( gemmt_u_ker_var2 )
//
// Prototype BLAS-like interfaces with void pointer operands.
//
// GENTPROT( ctype, ch, varname ) declares one typed macrokernel whose name
// is produced by PASTEMAC(ch,varname). The parameter list is:
//   diagoffc            - diagonal offset of the panel of C being updated.
//   schema_a, schema_b  - pack schemas of the packed A and B operands.
//   m, n, k             - dimensions of the panel product.
//   alpha, beta         - scalars, passed as untyped pointers.
//   a, cs_a, is_a, pd_a, ps_a - packed A: buffer, column stride, imaginary
//                               stride, panel dimension, panel stride.
//   b, rs_b, is_b, pd_b, ps_b - packed B: buffer, row stride, imaginary
//                               stride, panel dimension, panel stride.
//   c, rs_c, cs_c       - output buffer and its row/column strides.
//   cntx, rntm, thread  - context, runtime object, thread info.
// NOTE(review): INSERT_GENTPROT_BASIC0 presumably instantiates GENTPROT once
// per supported datatype -- confirm against the macro's definition.
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemmt/bli_gemmt_x_ker_var2.c 0000664 0000000 0000000 00000004522 14634250137 0024424 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Dispatch table of gemmt macrokernel wrappers, indexed by how C is stored:
// entry 0 is the lower-stored variant and entry 1 the upper-stored variant.
// bli_gemmt_x_ker_var2() selects the index from the uplo of C's root object.
static l3_var_oft vars[2] =
{
bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2,
};
// Storage-agnostic entry point for the gemmt macrokernel: inspects whether
// the root object of C is lower-stored and forwards every argument,
// unchanged, to the matching lower/upper wrapper held in vars[].
void bli_gemmt_x_ker_var2
     (
       obj_t*     a,
       obj_t*     ah,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Slot 0 of vars[] is the lower-stored variant, slot 1 the upper one.
    const dim_t slot = ( bli_obj_root_is_lower( c ) ? 0 : 1 );

    // Fetch the selected macrokernel wrapper and hand everything over.
    l3_var_oft macrokernel = vars[ slot ];

    macrokernel( a, ah, c, cntx, rntm, cntl, thread );
}
cython-blis-1.0.0/blis/_src/frame/3/gemmt/other/ 0000775 0000000 0000000 00000000000 14634250137 0021315 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c 0000664 0000000 0000000 00000030061 14634250137 0025526 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for the name gemmt_fp, so the typedef
// below declares gemmt_fp: a pointer to a typed (datatype-specific) gemmt
// macrokernel taking raw buffers, strides and panel metadata instead of
// obj_t operands.
#define FUNCPTR_T gemmt_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed lower-stored kernels; bli_gemmt_l_ker_var2() indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY presumably expands to an array initialized with one
// entry per datatype -- confirm against the macro's definition.
static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);
// Object-based wrapper for the lower-stored gemmt macrokernel. It unpacks
// the fields of the packed operands A and B and of the output C, merges the
// scalars attached to A and B into a single alpha, takes beta from C, and
// calls the typed kernel selected by C's execution datatype.
// The cntl argument is accepted for interface compatibility but not used.
void bli_gemmt_l_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Output matrix C: execution datatype, diagonal offset, dimensions,
    // buffer address and strides.
    const num_t  exec_dt = bli_obj_exec_dt( c );
    const doff_t c_doff  = bli_obj_diag_offset( c );
    const dim_t  c_rows  = bli_obj_length( c );
    const dim_t  c_cols  = bli_obj_width( c );
    void* const  c_buf   = bli_obj_buffer_at_off( c );
    const inc_t  c_rs    = bli_obj_row_stride( c );
    const inc_t  c_cs    = bli_obj_col_stride( c );

    // Left operand A: pack schema, inner dimension, buffer and panel layout.
    const pack_t a_schema = bli_obj_pack_schema( a );
    const dim_t  inner_k  = bli_obj_width( a );
    void* const  a_buf    = bli_obj_buffer_at_off( a );
    const inc_t  a_cs     = bli_obj_col_stride( a );
    const inc_t  a_is     = bli_obj_imag_stride( a );
    const dim_t  a_pd     = bli_obj_panel_dim( a );
    const inc_t  a_ps     = bli_obj_panel_stride( a );

    // Right operand B: pack schema, buffer and panel layout.
    const pack_t b_schema = bli_obj_pack_schema( b );
    void* const  b_buf    = bli_obj_buffer_at_off( b );
    const inc_t  b_rs     = bli_obj_row_stride( b );
    const inc_t  b_is     = bli_obj_imag_stride( b );
    const dim_t  b_pd     = bli_obj_panel_dim( b );
    const inc_t  b_ps     = bli_obj_panel_stride( b );

    // Pull the scalars off A and B and multiply them together; the merged
    // value is the one read back from kappa_ab below.
    obj_t kappa_a;
    obj_t kappa_ab;

    bli_obj_scalar_detach( a, &kappa_a );
    bli_obj_scalar_detach( b, &kappa_ab );
    bli_mulsc( &kappa_a, &kappa_ab );

    // alpha is the merged A*B scalar; beta is the scalar attached to C.
    void* const alpha_buf = bli_obj_internal_scalar_buffer( &kappa_ab );
    void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Pick the typed kernel for the execution datatype and run it.
    FUNCPTR_T typed_kernel = ftypes[ exec_dt ];

    typed_kernel
    (
      c_doff,
      a_schema,
      b_schema,
      c_rows,
      c_cols,
      inner_k,
      alpha_buf,
      a_buf, a_cs, a_is,
      a_pd, a_ps,
      b_buf, b_rs, b_is,
      b_pd, b_ps,
      beta_buf,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
// Typed macrokernel for gemmt with a lower-stored C.
//
// GENTFUNC( ctype, ch, varname ) defines PASTEMAC(ch,varname), which walks
// the m x n panel of C in MR x NR blocks (MR = pd_a, NR = pd_b) and, for
// each block:
//   - if the diagonal intersects the block: computes the product into the
//     stack buffer ct and merges it into C through xpbys_mxn_l;
//   - if the block is strictly below the diagonal: either calls the
//     micro-kernel directly on C (full MR x NR block) or goes through ct
//     and xpbys_mxn (edge block);
//   - otherwise (strictly above the diagonal): does nothing.
// Columns of blocks are distributed round-robin over the threads described
// by `thread`, rows of blocks over those of its sub-node. The rntm argument
// is not referenced by the body.
//
// Parameters: see the argument list; a/b are the packed operands with their
// strides (cs_a/rs_b), imaginary strides (is_*), panel dimensions (pd_*) and
// panel strides (ps_*); c is the output with strides rs_c/cs_c; alpha and
// beta are untyped pointers to scalars of type ctype.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases and diagonal-intersecting blocks.
Its strides follow the micro-kernel's storage preference as reported
by the context (col_pref): unit row stride when the micro-kernel
prefers columns, unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, ip; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely above the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region above where the diagonal of C intersects
the left edge of the panel, skip the whole micro-panels of A (and the
matching rows of C) that lie entirely inside that region.
Skipping i = ip*MR rows moves the diagonal offset to diagoffc + i,
which equals -( -diagoffc % MR ): zero when diagoffc is a multiple of
MR, and otherwise still negative (the diagonal meets the left edge
-diagoffc % MR rows below the new top). This keeps the offset
consistent with the diagoffc_ij formula used in the loops below. */ \
if ( diagoffc < 0 ) \
{ \
ip = -diagoffc / MR; \
i = ip * MR; \
m = m - i; \
diagoffc = -( -diagoffc % MR ); \
c_cast = c_cast + (i )*rs_c; \
a_cast = a_cast + (ip )*ps_a; \
} \
\
/* If there is a zero region to the right of where the diagonal
of C intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffc + m < n ) \
{ \
n = diagoffc + m; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) block counts as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Work distribution: the outer (jr) loop is split over the threads of
`thread`, the inner (ir) loop over the threads of its sub-node. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. On the last
ir iteration the next A panel wraps back to the first one and the
next B panel advances (wrapping too on the last jr iteration). */ \
a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or below the diagonal.
Otherwise, if the submatrix is strictly below the diagonal,
we compute and store as we normally would.
And if we're strictly above the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c 0000664 0000000 0000000 00000030063 14634250137 0025541 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for the name gemmt_fp, so the typedef
// below declares gemmt_fp: a pointer to a typed (datatype-specific) gemmt
// macrokernel taking raw buffers, strides and panel metadata instead of
// obj_t operands.
#define FUNCPTR_T gemmt_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffc,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, inc_t is_a,
dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, inc_t is_b,
dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed upper-stored kernels; bli_gemmt_u_ker_var2() indexes it
// with the execution datatype of C.
// NOTE(review): GENARRAY presumably expands to an array initialized with one
// entry per datatype -- confirm against the macro's definition.
static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
// Object-based wrapper for the upper-stored gemmt macrokernel. It unpacks
// the fields of the packed operands A and B and of the output C, merges the
// scalars attached to A and B into a single alpha, takes beta from C, and
// calls the typed kernel selected by C's execution datatype.
// The cntl argument is accepted for interface compatibility but not used.
void bli_gemmt_u_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Output matrix C: execution datatype, diagonal offset, dimensions,
    // buffer address and strides.
    const num_t  exec_dt = bli_obj_exec_dt( c );
    const doff_t c_doff  = bli_obj_diag_offset( c );
    const dim_t  c_rows  = bli_obj_length( c );
    const dim_t  c_cols  = bli_obj_width( c );
    void* const  c_buf   = bli_obj_buffer_at_off( c );
    const inc_t  c_rs    = bli_obj_row_stride( c );
    const inc_t  c_cs    = bli_obj_col_stride( c );

    // Left operand A: pack schema, inner dimension, buffer and panel layout.
    const pack_t a_schema = bli_obj_pack_schema( a );
    const dim_t  inner_k  = bli_obj_width( a );
    void* const  a_buf    = bli_obj_buffer_at_off( a );
    const inc_t  a_cs     = bli_obj_col_stride( a );
    const inc_t  a_is     = bli_obj_imag_stride( a );
    const dim_t  a_pd     = bli_obj_panel_dim( a );
    const inc_t  a_ps     = bli_obj_panel_stride( a );

    // Right operand B: pack schema, buffer and panel layout.
    const pack_t b_schema = bli_obj_pack_schema( b );
    void* const  b_buf    = bli_obj_buffer_at_off( b );
    const inc_t  b_rs     = bli_obj_row_stride( b );
    const inc_t  b_is     = bli_obj_imag_stride( b );
    const dim_t  b_pd     = bli_obj_panel_dim( b );
    const inc_t  b_ps     = bli_obj_panel_stride( b );

    // Pull the scalars off A and B and multiply them together; the merged
    // value is the one read back from kappa_ab below.
    obj_t kappa_a;
    obj_t kappa_ab;

    bli_obj_scalar_detach( a, &kappa_a );
    bli_obj_scalar_detach( b, &kappa_ab );
    bli_mulsc( &kappa_a, &kappa_ab );

    // alpha is the merged A*B scalar; beta is the scalar attached to C.
    void* const alpha_buf = bli_obj_internal_scalar_buffer( &kappa_ab );
    void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

    // Pick the typed kernel for the execution datatype and run it.
    FUNCPTR_T typed_kernel = ftypes[ exec_dt ];

    typed_kernel
    (
      c_doff,
      a_schema,
      b_schema,
      c_rows,
      c_cols,
      inner_k,
      alpha_buf,
      a_buf, a_cs, a_is,
      a_pd, a_ps,
      b_buf, b_rs, b_is,
      b_pd, b_ps,
      beta_buf,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
// Typed macrokernel for gemmt with an upper-stored C.
//
// GENTFUNC( ctype, ch, varname ) defines PASTEMAC(ch,varname), which walks
// the m x n panel of C in MR x NR blocks (MR = pd_a, NR = pd_b) and, for
// each block:
//   - if the diagonal intersects the block: computes the product into the
//     stack buffer ct and merges it into C through xpbys_mxn_u;
//   - if the block is strictly above the diagonal: either calls the
//     micro-kernel directly on C (full MR x NR block) or goes through ct
//     and xpbys_mxn (edge block);
//   - otherwise (strictly below the diagonal): does nothing.
// Columns of blocks are distributed round-robin over the threads described
// by `thread`, rows of blocks over those of its sub-node. The rntm argument
// is not referenced by the body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffc, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, inc_t is_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, inc_t is_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
/*const dim_t PACKMR = cs_a;*/ \
/*const dim_t PACKNR = rs_b;*/ \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases and diagonal-intersecting blocks.
Its strides follow the micro-kernel's storage preference as reported
by the context (col_pref): unit row stride when the micro-kernel
prefers columns, unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffc_ij; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t i, j, jp; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of C is entirely below the diagonal,
it is not stored. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
/* If there is a zero region to the left of where the diagonal of C
intersects the top edge of the panel, skip the whole micro-panels of
B (and the matching columns of C) that lie entirely inside that
region. Skipping j = jp*NR columns leaves a diagonal offset of
diagoffc % NR, which is zero only when diagoffc is a multiple of
NR. */ \
if ( diagoffc > 0 ) \
{ \
jp = diagoffc / NR; \
j = jp * NR; \
n = n - j; \
diagoffc = diagoffc % NR; \
c_cast = c_cast + (j )*cs_c; \
b_cast = b_cast + (jp )*ps_b; \
} \
\
/* If there is a zero region below where the diagonal of C intersects
the right edge of the panel, shrink it to prevent "no-op" iterations
from executing. */ \
if ( -diagoffc + n < m ) \
{ \
m = -diagoffc + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) block counts as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( is_a, &aux ); \
bli_auxinfo_set_is_b( is_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Work distribution: the outer (jr) loop is split over the threads of
`thread`, the inner (ir) loop over the threads of its sub-node. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
dim_t jr_num_threads = bli_thread_n_way( thread ); \
dim_t jr_thread_id = bli_thread_work_id( thread ); \
dim_t ir_num_threads = bli_thread_n_way( caucus ); \
dim_t ir_thread_id = bli_thread_work_id( caucus ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Interior loop over the m dimension (MR rows at a time). */ \
for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
/* Compute the diagonal offset for the submatrix at (i,j). */ \
diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. On the last
ir iteration the next A panel wraps back to the first one and the
next B panel advances (wrapping too on the last jr iteration). */ \
a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
{ \
a2 = a_cast; \
b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* If the diagonal intersects the current MR x NR submatrix, we
compute it in the temporary buffer and then add in the elements
on or above the diagonal.
Otherwise, if the submatrix is strictly above the diagonal,
we compute and store as we normally would.
And if we're strictly below the diagonal, we do nothing and
continue. */ \
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale C and add the result to only the stored part. */ \
PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
{ \
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Scale the edge of C and add the result. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
beta_cast, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
}
INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/hemm/ 0000775 0000000 0000000 00000000000 14634250137 0020011 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/hemm/bli_hemm.h 0000664 0000000 0000000 00000003246 14634250137 0021743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_hemm_front.h"
cython-blis-1.0.0/blis/_src/frame/3/hemm/bli_hemm_front.c 0000664 0000000 0000000 00000013766 14634250137 0023156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Front-end for hemm: C := beta * C + alpha * A * B (side == left) or
// C := beta * C + alpha * B * A (side == right), with A Hermitian.
// The routine works on local aliases of the operands, optionally transposes
// or swaps them so the back-end sees a plain gemm-family problem, configures
// pack schemas and threading, and finally launches the internal back-end
// through the level-3 thread decorator.
void bli_hemm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    bli_init_once();

    // Trivial case: with alpha == 0 the product vanishes, so only the
    // scaling of C by beta remains.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Work on aliases so the caller's objects are never modified by the
    // transformations below.
    obj_t a_view;
    obj_t b_view;
    obj_t c_view;

    bli_obj_alias_to( a, &a_view );
    bli_obj_alias_to( b, &b_view );
    bli_obj_alias_to( c, &c_view );

    // Fold the row/column offsets into the buffer address and zero them.
    // Any alias that was a view into a larger matrix thereby "forgets" its
    // lineage.
    bli_obj_reset_origin( &a_view );
    bli_obj_reset_origin( &b_view );
    bli_obj_reset_origin( &c_view );

#ifdef BLIS_DISABLE_HEMM_RIGHT
    // Right-side hemm is recast as left-side hemm in this configuration.
    //
    // Why: some subconfigurations use a gemm microkernel that expects the
    // packing kernel to have already duplicated (broadcast) the elements of
    // B in the packed copy of B. Supporting that duplication inside the
    // logic that packs micropanels from Hermitian/symmetric matrices would
    // be ugly, so it is not supported; such subconfigurations instead
    // #define BLIS_DISABLE_HEMM_RIGHT to force the Hermitian matrix onto the
    // left (and thus the general matrix onto the right).
    //
    // Cost: the microkernel may end up running on an output matrix through
    // its general-stride IO case (unless it supports both the row and the
    // column IO cases).
    //
    // When A multiplies from the right, transpose every operand so the
    // computation proceeds as if A multiplied from the left.
    if ( bli_is_right( side ) )
    {
        bli_toggle_side( &side );
        bli_obj_induce_trans( &a_view );
        bli_obj_induce_trans( &b_view );
        bli_obj_induce_trans( &c_view );
    }
#else
    // Right-side hemm/symm is computed natively in this configuration:
    // elements of the Hermitian/symmetric matrix A are packed into
    // micropanels of the right-hand packed operand "B", and elements of the
    // general matrix B into micropanels of the left-hand packed operand "A".
    // This path can always transpose the whole operation so that the
    // effective storage of the output matches the microkernel's output
    // preference, which makes it the preferred one for performance.

    // Optimization: when the storage of C (by rows or by columns) is the
    // opposite of what the micro-kernel prefers, transpose the entire
    // operation so the micro-kernel accesses C in its preferred manner.
    // NOTE: Do NOT additionally guard this with a !bli_obj_is_1x1( C )
    // check; see the comments on issue #342.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_view, BLIS_GEMM_UKR, cntx ) )
    {
        bli_toggle_side( &side );
        bli_obj_toggle_conj( &a_view );
        bli_obj_induce_trans( &b_view );
        bli_obj_induce_trans( &c_view );
    }

    // When the Hermitian/symmetric matrix multiplies from the right,
    // exchange A and B so that it really sits in the right-hand slot.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( &a_view, &b_view );
    }
#endif

    // Record the pack schemas inside the objects.
    bli_l3_set_schemas( &a_view, &b_view, &c_view, cntx );

    // Problem dimensions as seen by the back-end.
    const dim_t m_dim = bli_obj_length( &c_view );
    const dim_t n_dim = bli_obj_width( &c_view );
    const dim_t k_dim = bli_obj_width( &a_view );

    // Interpret the rntm_t object to set the ways of parallelism of each
    // loop, including any adjustments specific to this operation.
    bli_rntm_set_ways_for_op
    (
      BLIS_HEMM,
      BLIS_LEFT, // ignored for gemm/hemm/symm
      m_dim,
      n_dim,
      k_dim,
      rntm
    );

    // Launch the internal back-end.
    bli_l3_thread_decorator
    (
      bli_l3_int,
      BLIS_GEMM, // operation family id
      alpha,
      &a_view,
      &b_view,
      beta,
      &c_view,
      cntx,
      rntm,
      cntl
    );
}
cython-blis-1.0.0/blis/_src/frame/3/hemm/bli_hemm_front.h 0000664 0000000 0000000 00000003540 14634250137 0023150 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototype of the object-based front-end for hemm.
//   side        - side from which matrix a is multiplied (left or right).
//   alpha, beta - scalar objects.
//   a, b, c     - matrix operand objects; c is the output matrix.
//   cntx, rntm, cntl - context, runtime and control tree pointers
//                 (roles inferred from the type names -- confirm against
//                 the implementation).
void bli_hemm_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/3/old/ 0000775 0000000 0000000 00000000000 14634250137 0017641 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/old/bli_l3_ft_ex.h 0000664 0000000 0000000 00000013470 14634250137 0022350 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_L3_FT_EX_H
#define BLIS_L3_FT_EX_H
//
// -- Level-3 expert function types --------------------------------------------
//
// gemm
// Function pointer type for gemm with the expert signature. The type name is
// assembled by PASTECH3 from the type character (ch), the operation name,
// BLIS_TAPI_EX_SUF and the suffix tsuf. In addition to the operands
// (pointer plus row/column stride for each matrix), the signature carries a
// cntx_t* and an rntm_t*.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t n, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEF( gemm )
// hemm, symm
// Function pointer type shared by hemm and symm (the same GENTDEF body is
// instantiated for both operation names below). Matrix a is described by
// side/uploa/conja; b takes a trans_t. Only m and n are passed as dimensions.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
side_t side, \
uplo_t uploa, \
conj_t conja, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEF( hemm )
INSERT_GENTDEF( symm )
// herk
// Function pointer type for herk. Uses the GENTDEFR form because both alpha
// and beta are passed as ctype_r*, while the matrices a and c are ctype*.
// (ctype_r is presumably the real counterpart of ctype -- confirm against
// the INSERT_GENTDEFR definition.)
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype_r* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEFR( herk )
// her2k
// Function pointer type for her2k. Unlike herk, alpha is a ctype* here; only
// beta is passed as ctype_r*.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype_r* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEFR( her2k )
// syrk
// Function pointer type for syrk. Note that this is generated through the
// GENTDEFR form even though the ctype_r and chr macro parameters are not
// used in the body: alpha and beta are both ctype*.
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
uplo_t uploc, \
trans_t transa, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEFR( syrk )
// syr2k
// Function pointer type for syr2k. Same parameter layout as her2k above,
// except that beta is a ctype* rather than a ctype_r*.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
uplo_t uploc, \
trans_t transa, \
trans_t transb, \
dim_t m, \
dim_t k, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEF( syr2k )
// trmm3
// Function pointer type for trmm3. Compared with the trmm/trsm type below,
// this signature additionally takes transb, beta and a separate output
// matrix c.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
trans_t transb, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
ctype* beta, \
ctype* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEF( trmm3 )
// trmm, trsm
// Function pointer type shared by trmm and trsm (the same GENTDEF body is
// instantiated for both operation names below). There is no beta and no
// separate c operand in this signature; only a and b are passed.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
( \
side_t side, \
uplo_t uploa, \
trans_t transa, \
diag_t diaga, \
dim_t m, \
dim_t n, \
ctype* alpha, \
ctype* a, inc_t rs_a, inc_t cs_a, \
ctype* b, inc_t rs_b, inc_t cs_b, \
cntx_t* cntx, \
rntm_t* rntm \
);
INSERT_GENTDEF( trmm )
INSERT_GENTDEF( trsm )
#endif
cython-blis-1.0.0/blis/_src/frame/3/old/bli_l3_sup_edge.h 0000664 0000000 0000000 00000007652 14634250137 0023043 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Cover an m0 x n0 region of C with double-precision gemmsup kernel calls.

   The region is carved into sub-blocks whose sizes are read, in order, from
   the mrs[] (m dimension) and nrs[] (n dimension) tables. A table entry is
   used only if it still fits into what remains of that dimension; otherwise
   it is skipped. Each table entry is visited at most once per pass. The
   kernel for the sub-block built from mrs[i] and nrs[j] is taken from
   kmap[ i*num_nr + j ].

   NOTE(review): the table indices are not checked against num_mr/num_nr
   (num_mr is not referenced at all), so termination relies on the tables
   being able to consume m0 and n0 exactly -- confirm with the callers.
*/
static
void bli_dgemmsup_ker_edge_dispatcher
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx,
       const dim_t         num_mr,
       const dim_t         num_nr,
       dim_t*     restrict mrs,
       dim_t*     restrict nrs,
       dgemmsup_ker_ft*    kmap
     )
{
#if 1
    // Active traversal: the outer loop consumes the n dimension (nrs[]),
    // the inner loop consumes the m dimension (mrs[]).
    double* restrict c_col = c;
    double* restrict b_col = b;
    dim_t            n_rem = n0;
    dim_t            j     = 0;

    while ( n_rem != 0 )
    {
        const dim_t nr_blk = nrs[ j ];

        if ( n_rem >= nr_blk )
        {
            // Restart the walk over the m dimension for this column block.
            double* restrict c_blk = c_col;
            double* restrict a_row = a;
            dim_t            m_rem = m0;
            dim_t            i     = 0;

            while ( m_rem != 0 )
            {
                const dim_t mr_blk = mrs[ i ];

                if ( m_rem >= mr_blk )
                {
                    dgemmsup_ker_ft edge_ker = kmap[ i*num_nr + j ];

                    edge_ker
                    (
                      conja, conjb, mr_blk, nr_blk, k0,
                      alpha, a_row, rs_a0, cs_a0, b_col, rs_b0, cs_b0,
                      beta, c_blk, rs_c0, cs_c0, data, cntx
                    );

                    // Step past the rows that were just computed.
                    m_rem -= mr_blk;
                    a_row += mr_blk*rs_a0;
                    c_blk += mr_blk*rs_c0;
                }

                // The table index advances whether or not the entry fit.
                ++i;
            }

            // Step past the columns that were just computed.
            n_rem -= nr_blk;
            b_col += nr_blk*cs_b0;
            c_col += nr_blk*cs_c0;
        }

        ++j;
    }
#else
    // Disabled alternative: the outer loop consumes the m dimension (mrs[]),
    // the inner loop consumes the n dimension (nrs[]).
    double* restrict c_row = c;
    double* restrict a_row = a;
    dim_t            m_rem = m0;
    dim_t            i     = 0;

    while ( m_rem != 0 )
    {
        const dim_t mr_blk = mrs[ i ];

        if ( m_rem >= mr_blk )
        {
            double* restrict c_blk = c_row;
            double* restrict b_col = b;
            dim_t            n_rem = n0;
            dim_t            j     = 0;

            while ( n_rem != 0 )
            {
                const dim_t nr_blk = nrs[ j ];

                if ( n_rem >= nr_blk )
                {
                    dgemmsup_ker_ft edge_ker = kmap[ i*num_nr + j ];

                    edge_ker
                    (
                      conja, conjb, mr_blk, nr_blk, k0,
                      alpha, a_row, rs_a0, cs_a0, b_col, rs_b0, cs_b0,
                      beta, c_blk, rs_c0, cs_c0, data, cntx
                    );

                    n_rem -= nr_blk;
                    b_col += nr_blk*cs_b0;
                    c_blk += nr_blk*cs_c0;
                }

                ++j;
            }

            m_rem -= mr_blk;
            a_row += mr_blk*rs_a0;
            c_row += mr_blk*rs_c0;
        }

        ++i;
    }
#endif
}
cython-blis-1.0.0/blis/_src/frame/3/old/bli_l3_sup_var1n2m.c 0000664 0000000 0000000 00000060260 14634250137 0023412 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T (an alias for the name gemmsup_fp) is the function pointer type
// of the datatype-specific gemmsup variant implementations in this file.
// The operands are passed as untyped (void*) buffers together with their
// row and column strides; eff_id carries the stor3_t storage combination id.
#define FUNCPTR_T gemmsup_fp
typedef void (*FUNCPTR_T)
(
conj_t conja,
conj_t conjb,
dim_t m,
dim_t n,
dim_t k,
void* restrict alpha,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b,
void* restrict beta,
void* restrict c, inc_t rs_c, inc_t cs_c,
stor3_t eff_id,
cntx_t* restrict cntx,
rntm_t* restrict rntm,
cntl_t* restrict cntl,
thrinfo_t* restrict thread
);
//
// -- var1n --------------------------------------------------------------------
//
// Table of the datatype-specific var1n implementations; it is indexed below
// with the datatype of C.
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);

/*
   Object-based entry point for gemmsup variant 1n.

   Extracts conjugation flags, dimensions, buffers and strides from the
   operand objects, applies any transposition marked on a or b implicitly by
   exchanging their row and column strides, and then calls the typed
   implementation selected by the datatype of c.

   When trans requests a transposed operation, the typed function is called
   with a and b exchanged, m and n exchanged, the row/column strides of every
   operand exchanged, and the storage id passed through bli_stor3_trans().
*/
void bli_gemmsup_ref_var1n
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
#if 0
    // Disabled reference path: explicit aliases of a and b are transposed
    // instead of exchanging the strides by hand.
    obj_t at, bt;

    bli_obj_alias_to( a, &at );
    bli_obj_alias_to( b, &bt );

    // Induce transpositions on A and/or B if either object is marked for
    // transposition. We can induce "fast" transpositions since the objects
    // are guaranteed to not have structure or be packed.
    if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
    if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja = bli_obj_conj_status( a );
    const conj_t conjb = bli_obj_conj_status( b );

    const dim_t  m = bli_obj_length( c );
    const dim_t  n = bli_obj_width( c );
    const dim_t  k = bli_obj_width( &at );

    void* restrict buf_a = bli_obj_buffer_at_off( &at );
    const inc_t    rs_a  = bli_obj_row_stride( &at );
    const inc_t    cs_a  = bli_obj_col_stride( &at );

    void* restrict buf_b = bli_obj_buffer_at_off( &bt );
    const inc_t    rs_b  = bli_obj_row_stride( &bt );
    const inc_t    cs_b  = bli_obj_col_stride( &bt );

    void* restrict buf_c = bli_obj_buffer_at_off( c );
    const inc_t    rs_c  = bli_obj_row_stride( c );
    const inc_t    cs_c  = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#else
    // The datatype of C selects the typed implementation and the scalar
    // buffers.
    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja = bli_obj_conj_status( a );
    const conj_t conjb = bli_obj_conj_status( b );

    // m and n always come from C.
    const dim_t  m = bli_obj_length( c );
    const dim_t  n = bli_obj_width( c );

    void* restrict buf_a = bli_obj_buffer_at_off( a );
    void* restrict buf_b = bli_obj_buffer_at_off( b );
    void* restrict buf_c = bli_obj_buffer_at_off( c );

    // Begin with the as-stored view of A, then switch to the implicitly
    // transposed view if A is marked for transposition.
    dim_t k    = bli_obj_width( a );
    inc_t rs_a = bli_obj_row_stride( a );
    inc_t cs_a = bli_obj_col_stride( a );

    if ( !bli_obj_has_notrans( a ) )
    {
        k    = bli_obj_length( a );
        rs_a = bli_obj_col_stride( a );
        cs_a = bli_obj_row_stride( a );
    }

    // Same treatment for B (only its strides are needed).
    inc_t rs_b = bli_obj_row_stride( b );
    inc_t cs_b = bli_obj_col_stride( b );

    if ( !bli_obj_has_notrans( b ) )
    {
        rs_b = bli_obj_col_stride( b );
        cs_b = bli_obj_row_stride( b );
    }

    const inc_t rs_c = bli_obj_row_stride( c );
    const inc_t cs_c = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#endif

    // Look up the typed implementation for the execution datatype.
    FUNCPTR_T f = ftypes_var1n[dt_exec];

    if ( !bli_is_notrans( trans ) )
    {
        // Transposed operation: exchange the roles of A and B, of m and n,
        // and of the row/column strides of every operand.
        f
        (
          conjb,
          conja,
          n,
          m,
          k,
          buf_alpha,
          buf_b, cs_b, rs_b,
          buf_a, cs_a, rs_a,
          buf_beta,
          buf_c, cs_c, rs_c,
          bli_stor3_trans( eff_id ),
          cntx,
          rntm,
          cntl,
          thread
        );

        return;
    }

    // Untransposed operation: pass everything through as extracted.
    f
    (
      conja,
      conjb,
      m,
      n,
      k,
      buf_alpha,
      buf_a, rs_a, cs_a,
      buf_b, rs_b, cs_b,
      buf_beta,
      buf_c, rs_c, cs_c,
      eff_id,
      cntx,
      rntm,
      cntl,
      thread
    );
}
// Typed implementation of gemmsup variant 1n, instantiated through
// INSERT_GENTFUNC_BASIC0 below. The body blocks the m dimension by NC (and
// then by MR), the k dimension by KC and the n dimension by MC, and hands
// each (nr_cur x mc_cur x kc_cur) sub-problem to the sup kernel queried from
// the context for the transposed storage id. The rntm, cntl and thread
// parameters are accepted but not referenced in this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
) \
{ \
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* This transposition of the stor3_t id value is inherent to variant 1.
The reason: we assume that variant 2 is the "main" variant. The
consequence of this is that we assume that the millikernels that
iterate over m are registered to the kernel group associated with
the kernel preference. So, regardless of whether the mkernels are
row- or column-preferential, millikernels that iterate over n are
always placed in the slots for the opposite kernel group. */ \
stor_id = bli_stor3_trans( stor_id ); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Choose KC: the full KC0 for the RRC/CRC storage ids and for problems no
larger than one MR x NR tile; otherwise a fraction of KC0 that shrinks
as m and n grow. (The leading "if ( FALSE )" branch is never taken.) */ \
dim_t KC; \
if ( FALSE ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
\
/* NOTE(review): the integer divisions above round toward zero, so KC can
become 0 for small KC0 (e.g. KC0 < 20 in the final branch), and KC is
used as a divisor further down. Presumably the registered KC0 values
are always large enough -- confirm. */ \
\
/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
/* Query the maximum blocksize for MR, which implies a maximum blocksize
extension for the final iteration. */ \
const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
const dim_t MRE = MRM - MR; \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = rs_c * NC; \
const inc_t jcstep_a = rs_a * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = cs_c * MC; \
const inc_t icstep_b = cs_b * MC; \
\
const inc_t jrstep_c = rs_c * MR; \
const inc_t jrstep_a = rs_a * MR; \
\
/*
const inc_t irstep_c = cs_c * NR; \
const inc_t irstep_b = cs_b * NR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
/* NOTE(review): no field of aux is assigned in this function before its
address is passed to the kernel below. */ \
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = m / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( m + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, and jr_iter variables are computed in
similar manner. */ \
const dim_t jc_iter = ( m + NC - 1 ) / NC; \
const dim_t jc_left = m % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( n + MC - 1 ) / MC; \
const dim_t ic_left = n % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
/*
const dim_t ir_inc = 1; \
*/ \
\
/* Loop over the m dimension (NC rows/columns at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict a_jc = a_00 + jj * jcstep_a; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
dim_t jr_left = nc_cur % MR; \
\
/* An optimization: allow the last jr iteration to contain up to MRE
rows of C and A. (If MRE > MR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. */ \
if ( 1 ) \
if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
{ \
jr_iter--; jr_left += MR; \
} \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_jc + pp * pcstep_a; \
ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the n dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict b_ic = b_pc + ii * icstep_b; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
const dim_t ir_left = mc_cur % NR; \
*/ \
\
/* Loop over the m dimension within the current NC block (MR rows at
a time; the final iteration uses jr_left). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
\
ctype* restrict a_jr = a_pc + j * jrstep_a; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* There is no explicit innermost loop over the n dimension: the
millikernel is handed the whole mc_cur extent in one call. */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
nr_cur, /* Notice: nr_cur <= MR. */ \
mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
kc_cur, \
alpha_cast, \
a_jr, rs_a, cs_a, \
b_ic, rs_b, cs_b, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )
//
// -- var2m --------------------------------------------------------------------
//
// Table of the datatype-specific var2m implementations; it is indexed below
// with the datatype of C.
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);

/*
   Object-based entry point for gemmsup variant 2m.

   Extracts conjugation flags, dimensions, buffers and strides from the
   operand objects, applies any transposition marked on a or b implicitly by
   exchanging their row and column strides, and then calls the typed
   implementation selected by the datatype of c.

   When trans requests a transposed operation, the typed function is called
   with a and b exchanged, m and n exchanged, the row/column strides of every
   operand exchanged, and the storage id passed through bli_stor3_trans().
*/
void bli_gemmsup_ref_var2m
     (
       trans_t    trans,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       stor3_t    eff_id,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
#if 0
    // Disabled reference path: explicit aliases of a and b are transposed
    // instead of exchanging the strides by hand.
    obj_t at, bt;

    bli_obj_alias_to( a, &at );
    bli_obj_alias_to( b, &bt );

    // Induce transpositions on A and/or B if either object is marked for
    // transposition. We can induce "fast" transpositions since the objects
    // are guaranteed to not have structure or be packed.
    if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
    if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja = bli_obj_conj_status( a );
    const conj_t conjb = bli_obj_conj_status( b );

    const dim_t  m = bli_obj_length( c );
    const dim_t  n = bli_obj_width( c );
    const dim_t  k = bli_obj_width( &at );

    void* restrict buf_a = bli_obj_buffer_at_off( &at );
    const inc_t    rs_a  = bli_obj_row_stride( &at );
    const inc_t    cs_a  = bli_obj_col_stride( &at );

    void* restrict buf_b = bli_obj_buffer_at_off( &bt );
    const inc_t    rs_b  = bli_obj_row_stride( &bt );
    const inc_t    cs_b  = bli_obj_col_stride( &bt );

    void* restrict buf_c = bli_obj_buffer_at_off( c );
    const inc_t    rs_c  = bli_obj_row_stride( c );
    const inc_t    cs_c  = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#else
    // The datatype of C selects the typed implementation and the scalar
    // buffers.
    const num_t  dt_exec = bli_obj_dt( c );

    const conj_t conja = bli_obj_conj_status( a );
    const conj_t conjb = bli_obj_conj_status( b );

    // m and n always come from C.
    const dim_t  m = bli_obj_length( c );
    const dim_t  n = bli_obj_width( c );

    void* restrict buf_a = bli_obj_buffer_at_off( a );
    void* restrict buf_b = bli_obj_buffer_at_off( b );
    void* restrict buf_c = bli_obj_buffer_at_off( c );

    // Begin with the as-stored view of A, then switch to the implicitly
    // transposed view if A is marked for transposition.
    dim_t k    = bli_obj_width( a );
    inc_t rs_a = bli_obj_row_stride( a );
    inc_t cs_a = bli_obj_col_stride( a );

    if ( !bli_obj_has_notrans( a ) )
    {
        k    = bli_obj_length( a );
        rs_a = bli_obj_col_stride( a );
        cs_a = bli_obj_row_stride( a );
    }

    // Same treatment for B (only its strides are needed).
    inc_t rs_b = bli_obj_row_stride( b );
    inc_t cs_b = bli_obj_col_stride( b );

    if ( !bli_obj_has_notrans( b ) )
    {
        rs_b = bli_obj_col_stride( b );
        cs_b = bli_obj_row_stride( b );
    }

    const inc_t rs_c = bli_obj_row_stride( c );
    const inc_t cs_c = bli_obj_col_stride( c );

    void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
    void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );
#endif

    // Look up the typed implementation for the execution datatype.
    FUNCPTR_T f = ftypes_var2m[dt_exec];

    if ( !bli_is_notrans( trans ) )
    {
        // Transposed operation: exchange the roles of A and B, of m and n,
        // and of the row/column strides of every operand.
        f
        (
          conjb,
          conja,
          n,
          m,
          k,
          buf_alpha,
          buf_b, cs_b, rs_b,
          buf_a, cs_a, rs_a,
          buf_beta,
          buf_c, cs_c, rs_c,
          bli_stor3_trans( eff_id ),
          cntx,
          rntm,
          cntl,
          thread
        );

        return;
    }

    // Untransposed operation: pass everything through as extracted.
    f
    (
      conja,
      conjb,
      m,
      n,
      k,
      buf_alpha,
      buf_a, rs_a, cs_a,
      buf_b, rs_b, cs_b,
      buf_beta,
      buf_c, rs_c, cs_c,
      eff_id,
      cntx,
      rntm,
      cntl,
      thread
    );
}
// Typed implementation of gemmsup variant 2m, instantiated through
// INSERT_GENTFUNC_BASIC0 below. The body blocks the n dimension by NC (and
// then by NR), the k dimension by KC and the m dimension by MC, and hands
// each (mc_cur x nr_cur x kc_cur) sub-problem to the sup kernel queried from
// the context for stor_id. The rntm, cntl and thread parameters are accepted
// but not referenced in this body.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
conj_t conja, \
conj_t conjb, \
dim_t m, \
dim_t n, \
dim_t k, \
void* restrict alpha, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b, \
void* restrict beta, \
void* restrict c, inc_t rs_c, inc_t cs_c, \
stor3_t stor_id, \
cntx_t* restrict cntx, \
rntm_t* restrict rntm, \
cntl_t* restrict cntl, \
thrinfo_t* restrict thread \
) \
{ \
/* If m or n is zero, return immediately. */ \
if ( bli_zero_dim2( m, n ) ) return; \
\
/* If k < 1 or alpha is zero, scale by beta and return. */ \
if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
{ \
PASTEMAC(ch,scalm) \
( \
BLIS_NO_CONJUGATE, \
0, \
BLIS_NONUNIT_DIAG, \
BLIS_DENSE, \
m, n, \
beta, \
c, rs_c, cs_c \
); \
return; \
} \
\
const num_t dt = PASTEMAC(ch,type); \
\
/* Query the context for various blocksizes. */ \
const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
/* Choose KC: the full KC0 for the RRR/CCC and RRC/CRC storage ids and for
problems no larger than one MR x NR tile; otherwise a fraction of KC0
that shrinks as m and n grow. */ \
dim_t KC; \
if ( stor_id == BLIS_RRR || \
stor_id == BLIS_CCC ) KC = KC0; \
else if ( stor_id == BLIS_RRC || \
stor_id == BLIS_CRC ) KC = KC0; \
else if ( m <= MR && n <= NR ) KC = KC0; \
else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
else KC = (( KC0 / 5 ) / 4 ) * 4; \
\
/* NOTE(review): the integer divisions above round toward zero, so KC can
become 0 for small KC0 (e.g. KC0 < 20 in the final branch), and KC is
used as a divisor further down. Presumably the registered KC0 values
are always large enough -- confirm. */ \
\
/* Query the maximum blocksize for NR, which implies a maximum blocksize
extension for the final iteration. */ \
const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
const dim_t NRE = NRM - NR; \
\
/* Compute partitioning step values for each matrix of each loop. */ \
const inc_t jcstep_c = cs_c * NC; \
const inc_t jcstep_b = cs_b * NC; \
\
const inc_t pcstep_a = cs_a * KC; \
const inc_t pcstep_b = rs_b * KC; \
\
const inc_t icstep_c = rs_c * MC; \
const inc_t icstep_a = rs_a * MC; \
\
const inc_t jrstep_c = cs_c * NR; \
const inc_t jrstep_b = cs_b * NR; \
\
/*
const inc_t irstep_c = rs_c * MR; \
const inc_t irstep_a = rs_a * MR; \
*/ \
\
/* Query the context for the sup microkernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemmsup_ker_ft) \
gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
ctype* restrict a_00 = a; \
ctype* restrict b_00 = b; \
ctype* restrict c_00 = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
\
ctype* restrict one = PASTEMAC(ch,1); \
\
/* NOTE(review): no field of aux is assigned in this function before its
address is passed to the kernel below. */ \
auxinfo_t aux; \
\
/* Compute number of primary and leftover components of the outer
dimensions.
NOTE: Functionally speaking, we compute jc_iter as:
jc_iter = n / NC; if ( jc_left ) ++jc_iter;
However, this is implemented as:
jc_iter = ( n + NC - 1 ) / NC;
This avoids a branch at the cost of two additional integer instructions.
The pc_iter, ic_iter, and jr_iter variables are computed in
similar manner. */ \
const dim_t jc_iter = ( n + NC - 1 ) / NC; \
const dim_t jc_left = n % NC; \
\
const dim_t pc_iter = ( k + KC - 1 ) / KC; \
const dim_t pc_left = k % KC; \
\
const dim_t ic_iter = ( m + MC - 1 ) / MC; \
const dim_t ic_left = m % MC; \
\
const dim_t jc_inc = 1; \
const dim_t pc_inc = 1; \
const dim_t ic_inc = 1; \
const dim_t jr_inc = 1; \
/*
const dim_t ir_inc = 1; \
*/ \
\
/* Loop over the n dimension (NC rows/columns at a time). */ \
for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
{ \
const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
ctype* restrict b_jc = b_00 + jj * jcstep_b; \
ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
dim_t jr_left = nc_cur % NR; \
\
/* An optimization: allow the last jr iteration to contain up to NRE
columns of C and B. (If NRE > NR, the mkernel has agreed to handle
these cases.) Note that this prevents us from declaring jr_iter and
jr_left as const. */ \
if ( 1 ) \
if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
{ \
jr_iter--; jr_left += NR; \
} \
\
/* Loop over the k dimension (KC rows/columns at a time). */ \
for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
{ \
const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
ctype* restrict a_pc = a_00 + pp * pcstep_a; \
ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
/* Only apply beta to the first iteration of the pc loop. */ \
ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
/* Loop over the m dimension (MC rows at a time). */ \
for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
{ \
const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
ctype* restrict a_ic = a_pc + ii * icstep_a; \
ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
/*
const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
const dim_t ir_left = mc_cur % MR; \
*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
{ \
const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
ctype* restrict b_jr = b_pc + j * jrstep_b; \
ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
/* There is no explicit innermost loop over the m dimension: the
millikernel is handed the whole mc_cur extent in one call. */ \
{ \
/* Invoke the gemmsup millikernel. */ \
gemmsup_ker \
( \
conja, \
conjb, \
mc_cur, \
nr_cur, \
kc_cur, \
alpha_cast, \
a_ic, rs_a, cs_a, \
b_jr, rs_b, cs_b, \
beta_use, \
c_jr, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )
cython-blis-1.0.0/blis/_src/frame/3/symm/ 0000775 0000000 0000000 00000000000 14634250137 0020050 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/symm/bli_symm.h 0000664 0000000 0000000 00000003246 14634250137 0022041 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_symm_front.h"
cython-blis-1.0.0/blis/_src/frame/3/symm/bli_symm_front.c 0000664 0000000 0000000 00000013722 14634250137 0023244 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Object-based front-end for symm.

   Handles the alpha == 0 shortcut, takes local aliases of the operands,
   arranges them (inducing transposes and/or swapping A and B) for the
   gemm-family back-end, and finally launches that back-end through the
   level-3 thread decorator.

   side  - side from which the symmetric matrix A is applied.
   alpha - scalar; if it equals BLIS_ZERO, C is scaled by beta and the
           function returns early.
   a     - the symmetric matrix operand.
   b     - the general matrix operand.
   beta  - scalar applied to C.
   c     - the output matrix.
   cntx, rntm, cntl - forwarded to the helpers and the back-end.
*/
void bli_symm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    obj_t a_alias;
    obj_t b_alias;
    obj_t c_alias;

    bli_init_once();

    // Shortcut: with alpha equal to zero only the scaling of C by beta
    // remains to be done.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( beta, c );
        return;
    }

    // Work on local aliases so the caller's objects are left untouched by
    // the transformations below. For each alias, the buffer field is set to
    // the location implied by its row and column offsets and the offsets are
    // zeroed, so an alias of a view into a larger matrix "forgets" its
    // lineage.
    bli_obj_alias_to( a, &a_alias );
    bli_obj_reset_origin( &a_alias );

    bli_obj_alias_to( b, &b_alias );
    bli_obj_reset_origin( &b_alias );

    bli_obj_alias_to( c, &c_alias );
    bli_obj_reset_origin( &c_alias );

#ifdef BLIS_DISABLE_SYMM_RIGHT
    // Right-side symm is expressed in terms of the left-side case. A
    // subconfiguration defines BLIS_DISABLE_SYMM_RIGHT when its gemm
    // microkernel expects the packing kernel to have duplicated (broadcast)
    // the elements of B in the packed copy of B; the code that packs
    // micropanels from symmetric matrices does not support that
    // duplication, so the symmetric matrix must be forced onto the left
    // (and the general matrix onto the right).
    //
    // Drawback: the microkernel may end up running on an output matrix via
    // its general-stride IO case (unless it supports both the row and the
    // column IO cases).
    if ( bli_is_right( side ) )
    {
        // Transpose every operand so the computation proceeds as if A were
        // applied from the left.
        bli_toggle_side( &side );
        bli_obj_induce_trans( &a_alias );
        bli_obj_induce_trans( &b_alias );
        bli_obj_induce_trans( &c_alias );
    }
#else
    // Right-side hemm/symm is computed natively: elements of the
    // Hermitian/symmetric matrix A are packed into micropanels of the
    // right-hand packed operand "B", and elements of the general matrix B
    // into micropanels of the left-hand packed operand "A". This path can
    // always transpose the whole operation so that the effective storage of
    // the output matches the microkernel's output preference, which makes
    // it the preferred path for performance.

    // If the gemm microkernel dislikes the storage of C (row-stored C with a
    // column-preferring microkernel, or vice versa), transpose the entire
    // operation. Only B and C are transposed here.
    // NOTE: Do NOT guard this with a "C is not 1x1" test; see the comments
    // on issue #342.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_alias, BLIS_GEMM_UKR, cntx ) )
    {
        bli_toggle_side( &side );
        bli_obj_induce_trans( &b_alias );
        bli_obj_induce_trans( &c_alias );
    }

    // When the Hermitian/symmetric matrix is applied from the right, swap
    // the A and B aliases so that it really sits on the right.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( &a_alias, &b_alias );
    }
#endif

    // Record the pack schemas in the aliased objects.
    bli_l3_set_schemas( &a_alias, &b_alias, &c_alias, cntx );

    // Problem dimensions as seen after the transformations above.
    const dim_t m_dim = bli_obj_length( &c_alias );
    const dim_t n_dim = bli_obj_width( &c_alias );
    const dim_t k_dim = bli_obj_width( &a_alias );

    // Interpret the rntm_t object to set the ways of parallelism for each
    // loop, adjusted for this operation.
    bli_rntm_set_ways_for_op
    (
      BLIS_SYMM,
      BLIS_LEFT, // ignored for gemm/hemm/symm
      m_dim,
      n_dim,
      k_dim,
      rntm
    );

    // Hand off to the internal back-end.
    bli_l3_thread_decorator
    (
      bli_l3_int,
      BLIS_GEMM, // operation family id
      alpha,
      &a_alias,
      &b_alias,
      beta,
      &c_alias,
      cntx,
      rntm,
      cntl
    );
}
cython-blis-1.0.0/blis/_src/frame/3/symm/bli_symm_front.h 0000664 0000000 0000000 00000003540 14634250137 0023246 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Front-end for the object-based symm operation.

   side  - side from which the symmetric matrix a is applied.
   alpha - scalar; when it equals BLIS_ZERO the implementation only scales
           c by beta and returns.
   a     - symmetric matrix operand.
   b     - general matrix operand.
   beta  - scalar applied to c.
   c     - output matrix.
   cntx, rntm, cntl - context, runtime and control-tree objects that the
           implementation forwards to the internal back-end.
*/
void bli_symm_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/3/trmm/ 0000775 0000000 0000000 00000000000 14634250137 0020042 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm.h 0000664 0000000 0000000 00000003301 14634250137 0022015 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_trmm_front.h"
#include "bli_trmm_var.h"
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_front.c 0000664 0000000 0000000 00000015553 14634250137 0023234 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
   Object-based front-end for trmm (in-place: B is both input and output).

   Handles the alpha == 0 shortcut, takes local aliases of the operands
   (B is aliased twice, once as the input "B" and once as the output "C"),
   removes any pending transposition of A, arranges the operands for the
   back-end, and launches it through the level-3 thread decorator.

   side  - side from which the triangular matrix A is applied.
   alpha - scalar; if it equals BLIS_ZERO, B is scaled by alpha and the
           function returns early.
   a     - the triangular matrix operand.
   b     - the general matrix operand, overwritten with the result.
   cntx, rntm, cntl - forwarded to the helpers and the back-end.
*/
void bli_trmm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    obj_t a_alias;
    obj_t b_alias;
    obj_t c_alias;

    bli_init_once();

    // Shortcut: with alpha equal to zero, simply scale B by alpha (trmm has
    // no beta) and return.
    if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
    {
        bli_scalm( alpha, b );
        return;
    }

    // Work on local aliases so the objects can be tweaked freely. Note that
    // the "C" alias is taken from b. For each alias, the buffer field is set
    // to the location implied by its row and column offsets and the offsets
    // are zeroed, so an alias of a view into a larger matrix "forgets" its
    // lineage.
    bli_obj_alias_to( a, &a_alias );
    bli_obj_reset_origin( &a_alias );

    bli_obj_alias_to( b, &b_alias );
    bli_obj_reset_origin( &b_alias );

    bli_obj_alias_to( b, &c_alias );
    bli_obj_reset_origin( &c_alias );

    // Only the no-transpose cases are implemented explicitly. If A is marked
    // for transposition, induce the transpose and clear the flag. The right
    // algorithm is then selected automatically: e.g. an algorithm over a
    // transposed lower triangular A moves in the same direction (forwards)
    // as one over a non-transposed upper triangular matrix, and after the
    // induced transposition A simply appears upper triangular, so the upper
    // triangular algorithm grabs the correct partitions.
    if ( bli_obj_has_trans( &a_alias ) )
    {
        bli_obj_induce_trans( &a_alias );
        bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_alias );
    }

#ifdef BLIS_DISABLE_TRMM_RIGHT
    // Right-side trmm is expressed in terms of the left-side case. A
    // subconfiguration defines BLIS_DISABLE_TRMM_RIGHT when its gemm
    // microkernel expects the packing kernel to have duplicated (broadcast)
    // the elements of B in the packed copy of B; the code that packs
    // micropanels from triangular matrices does not support that
    // duplication, so the triangular matrix must be forced onto the left
    // (and the general matrix onto the right).
    //
    // Consequences: the microkernel may end up running on an output matrix
    // via its general-stride IO case (unless it supports both the row and
    // the column IO cases), and only two macrokernels are exercised
    // (trmm_ll and trmm_lu).
    if ( bli_is_right( side ) )
    {
        // Transpose every operand so the computation proceeds as if A were
        // applied from the left.
        bli_toggle_side( &side );
        bli_obj_induce_trans( &a_alias );
        bli_obj_induce_trans( &b_alias );
        bli_obj_induce_trans( &c_alias );
    }
#else
    // Right-side trmm is computed natively with the trmm_rl and trmm_ru
    // macrokernels. This path can always transpose the whole operation so
    // that the effective storage of the output matches the microkernel's
    // output preference, which makes it the preferred path for performance.

    // If the gemm microkernel dislikes the storage of C (row-stored C with a
    // column-preferring microkernel, or vice versa), transpose the entire
    // operation.
    // NOTE: Do NOT guard this with a "C is not 1x1" test; see the comments
    // on issue #342.
    if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_alias, BLIS_GEMM_UKR, cntx ) )
    {
        bli_toggle_side( &side );
        bli_obj_induce_trans( &a_alias );
        bli_obj_induce_trans( &b_alias );
        bli_obj_induce_trans( &c_alias );
    }

    // When A is applied from the right, swap the A and B aliases so that
    // the triangular matrix really sits on the right.
    if ( bli_is_right( side ) )
    {
        bli_obj_swap( &a_alias, &b_alias );
    }
#endif

    // Record the pack schemas in the aliased objects.
    bli_l3_set_schemas( &a_alias, &b_alias, &c_alias, cntx );

    // Problem dimensions as seen after the transformations above.
    const dim_t m_dim = bli_obj_length( &c_alias );
    const dim_t n_dim = bli_obj_width( &c_alias );
    const dim_t k_dim = bli_obj_width( &a_alias );

    // Interpret the rntm_t object to set the ways of parallelism for each
    // loop, adjusted for this operation and the (possibly toggled) side.
    bli_rntm_set_ways_for_op
    (
      BLIS_TRMM,
      side,
      m_dim,
      n_dim,
      k_dim,
      rntm
    );

    // Hand off to the internal back-end; BLIS_ZERO is passed in the beta
    // position.
    bli_l3_thread_decorator
    (
      bli_l3_int,
      BLIS_TRMM, // operation family id
      alpha,
      &a_alias,
      &b_alias,
      &BLIS_ZERO,
      &c_alias,
      cntx,
      rntm,
      cntl
    );
}
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_front.h 0000664 0000000 0000000 00000003471 14634250137 0023235 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
   Front-end for the object-based trmm operation, which overwrites b.

   side  - side from which the triangular matrix a is applied.
   alpha - scalar; when it equals BLIS_ZERO the implementation only scales
           b by alpha and returns.
   a     - triangular matrix operand.
   b     - general matrix operand; also serves as the output matrix.
   cntx, rntm, cntl - context, runtime and control-tree objects that the
           implementation forwards to the internal back-end.
*/
void bli_trmm_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_ll_ker_var2.c 0000664 0000000 0000000 00000031370 14634250137 0024301 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* Function-pointer type (named gemm_fp via FUNCPTR_T) describing the
   signature of the datatype-specific trmm_ll_ker_var2 macrokernels. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of typed macrokernels; bli_trmm_ll_ker_var2() indexes it with the
   execution datatype of C. */
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
/*
   Object-based wrapper around the typed trmm_ll_ker_var2 macrokernels.

   Unpacks dimensions, strides, buffers and pack schemas from the obj_t
   operands, merges the scalars attached to A and B into a single alpha,
   takes beta from the scalar attached to C, and dispatches to the
   macrokernel selected by the execution datatype of C.

   a, b, c - operand objects; m and n come from C, k is the width of A.
   cntx, rntm, thread - forwarded unchanged to the typed macrokernel.
   cntl    - not referenced in this function.
*/
void bli_trmm_ll_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Datatype used to pick the typed macrokernel.
    num_t  dt_exec  = bli_obj_exec_dt( c );

    // Diagonal offset of A and the pack schemas of A and B.
    doff_t diagoffa = bli_obj_diag_offset( a );
    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions.
    dim_t  m        = bli_obj_length( c );
    dim_t  n        = bli_obj_width( c );
    dim_t  k        = bli_obj_width( a );

    // Buffer and layout information for A.
    void*  buf_a    = bli_obj_buffer_at_off( a );
    inc_t  cs_a     = bli_obj_col_stride( a );
    dim_t  pd_a     = bli_obj_panel_dim( a );
    inc_t  ps_a     = bli_obj_panel_stride( a );

    // Buffer and layout information for B.
    void*  buf_b    = bli_obj_buffer_at_off( b );
    inc_t  rs_b     = bli_obj_row_stride( b );
    dim_t  pd_b     = bli_obj_panel_dim( b );
    inc_t  ps_b     = bli_obj_panel_stride( b );

    // Buffer and strides of C.
    void*  buf_c    = bli_obj_buffer_at_off( c );
    inc_t  rs_c     = bli_obj_row_stride( c );
    inc_t  cs_c     = bli_obj_col_stride( c );

    obj_t  scalar_a;
    obj_t  scalar_b;

    // Pull the scalars off of A and B and fold A's scalar into B's, so that
    // scalar_b ends up holding their product.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // alpha is the merged scalar from above; beta is the scalar attached
    // to C.
    void*  buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    void*  buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Look up the macrokernel for the execution datatype and call it.
    ftypes[dt_exec]
    (
      diagoffa,
      schema_a,
      schema_b,
      m,
      n,
      k,
      buf_alpha,
      buf_a, cs_a, pd_a, ps_a,
      buf_b, rs_b, pd_b, ps_b,
      buf_beta,
      buf_c, rs_c, cs_c,
      cntx,
      rntm,
      thread
    );
}
/*
   Typed macrokernel template for trmm_ll_ker_var2 (triangular matrix A on
   the left; blocks of A strictly above the diagonal are treated as
   implicitly zero).

   INSERT_GENTFUNC_BASIC0 below instantiates this template to define
   PASTEMAC(ch,trmm_ll_ker_var2). Each instance walks C in tiles: the outer
   (jr) loop steps over NR-column panels of packed B and is partitioned
   across threads via bli_thread_range_jrir(); the inner (ir) loop steps
   over MR-row micropanels of packed A and is not parallelized. For every
   tile the gemm microkernel queried from the context is invoked.

   Per micropanel of A, with diagoffa_i = diagoffa + i*MR:
   - if it intersects the diagonal, only the first
     k_a1011 = min( diagoffa_i + MR, k ) columns are multiplied, C is
     scaled by beta, and a1 advances by the shortened stride
     k_a1011 * PACKMR (rounded up to an even value);
   - if it lies strictly below the diagonal, all k columns are multiplied,
     C is scaled by one, and a1 advances by ps_a;
   - otherwise no microkernel call is made and a1 is not advanced.

   Parameters:
   diagoffa           - diagonal offset of the current block of A.
   schema_a, schema_b - pack schemas, stored in the auxinfo_t handed to the
                        microkernel.
   m, n, k            - dimensions of the current block.
   alpha, beta        - pointers to the scalars (cast to ctype*).
   a, cs_a, pd_a, ps_a - packed A, its PACKMR, MR and panel stride.
   b, rs_b, pd_b, ps_b - packed B, its PACKNR, NR and panel stride.
   c, rs_c, cs_c      - output block and its strides.
   cntx               - context; source of the microkernel pointer.
   rntm               - not referenced in this body.
   thread             - thrinfo_t node for the jr loop.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = is_a_cur; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + off_a1011 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
/*}*/ \
\
a1 += rstep_a; \
} \
\
/* Advance to the next MR rows of C whether or not a micro-kernel call
was made for this micro-panel of A. */ \
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_lu_ker_var2.c 0000664 0000000 0000000 00000031626 14634250137 0024316 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function-pointer type for the datatype-specific trmm_lu_ker_var2
// implementations. The argument list matches the GENTFUNC-generated
// functions defined later in this file: the diagonal offset of A, the pack
// schemas of A and B, the m/n/k dimensions, and untyped buffers with their
// strides/panel parameters for alpha, A, B, beta and C.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Dispatch table that bli_trmm_lu_ker_var2() indexes with the execution
// datatype of C. NOTE(review): presumably GENARRAY fills it with the
// per-datatype trmm_lu_ker_var2 variants -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
void bli_trmm_lu_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Object-based front end for the trmm_lu macro-kernel: unpack everything
	// the typed implementation needs from the A, B and C objects, fold the
	// scalars attached to A and B into a single alpha, and dispatch on the
	// execution datatype of C.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of A and the pack schemas of both packed operands.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: m and n are taken from C, k from the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Buffer, column stride, panel dimension and panel stride of A.
	void*        a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Buffer, row stride, panel dimension and panel stride of B.
	void*        b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Buffer and both strides of C.
	void*        c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	obj_t        a_scalar;
	obj_t        b_scalar;

	// Pull the scalars off A and B and multiply them together; the merged
	// value is read back out of b_scalar below.
	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &b_scalar );
	bli_mulsc( &a_scalar, &b_scalar );

	// alpha is the merged A*B scalar; beta is the scalar attached to C.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &b_scalar );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Call the implementation registered for the execution datatype.
	ftypes[exec_dt]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Type-generic body of the trmm_lu_ker_var2 macro-kernel. GENTFUNC is
   redefined here and expanded by INSERT_GENTFUNC_BASIC0() at the end of this
   block (presumably once per supported datatype -- confirm against the
   definition of that macro).

   Outline of the generated function:
   - Returns immediately if any of m, n, k is zero, or if the block of A is
     strictly below the diagonal.
   - If diagoffa > 0, k is reduced by diagoffa, the B pointer is advanced by
     diagoffa * PACKNR, and diagoffa is then treated as zero.
   - m is reduced to -diagoffa + k when that value is smaller than m.
   - The outer (jr) loop walks the n dimension NR columns at a time over the
     range reported by bli_thread_range_jrir(). The inner (ir) loop walks the
     m dimension MR rows at a time and is not partitioned among threads.
   - Micro-panels of A that intersect the diagonal invoke the gemm
     micro-kernel with the length k_a1112, the offset panel b1_i and beta;
     micro-panels strictly above the diagonal use k, b1 and a beta of one.

   NOTE(review): istep_a is computed but never read afterwards, and the rntm
   argument is not referenced anywhere in this body.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + i * PACKNR; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = is_a_cur; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + off_a1112 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
/*}*/ \
\
/* Step past this micro-panel using the stride computed for it above
(ps_a_cur) rather than the regular panel stride. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
/*}*/ \
\
/* Micro-panels above the diagonal advance by the regular panel
stride (rstep_a == ps_a). */ \
a1 += rstep_a; \
} \
\
/* Advance to the next MR rows of C regardless of which branch ran. */ \
c11 += rstep_c; \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_rl_ker_var2.c 0000664 0000000 0000000 00000035462 14634250137 0024315 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function-pointer type for the datatype-specific trmm_rl_ker_var2
// implementations. The argument list matches the GENTFUNC-generated
// functions defined later in this file: the diagonal offset of B, the pack
// schemas of A and B, the m/n/k dimensions, and untyped buffers with their
// strides/panel parameters for alpha, A, B, beta and C.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Dispatch table that bli_trmm_rl_ker_var2() indexes with the execution
// datatype of C. NOTE(review): presumably GENARRAY fills it with the
// per-datatype trmm_rl_ker_var2 variants -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Object-based front end for the trmm_rl macro-kernel: unpack everything
	// the typed implementation needs from the A, B and C objects, fold the
	// scalars attached to A and B into a single alpha, and dispatch on the
	// execution datatype of C.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both packed operands.
	const doff_t b_diagoff = bli_obj_diag_offset( b );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: m and n are taken from C, k from the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Buffer, column stride, panel dimension and panel stride of A.
	void*        a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Buffer, row stride, panel dimension and panel stride of B.
	void*        b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Buffer and both strides of C.
	void*        c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	obj_t        a_scalar;
	obj_t        b_scalar;

	// Pull the scalars off A and B and multiply them together; the merged
	// value is read back out of b_scalar below.
	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &b_scalar );
	bli_mulsc( &a_scalar, &b_scalar );

	// alpha is the merged A*B scalar; beta is the scalar attached to C.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &b_scalar );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Call the implementation registered for the execution datatype.
	ftypes[exec_dt]
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Type-generic body of the trmm_rl_ker_var2 macro-kernel. GENTFUNC is
   redefined here and expanded by INSERT_GENTFUNC_BASIC0() at the end of this
   block (presumably once per supported datatype -- confirm against the
   definition of that macro).

   Outline of the generated function:
   - Returns immediately if any of m, n, k is zero, or if the panel of B is
     strictly above the diagonal.
   - If diagoffb < 0, k is reduced by -diagoffb, the A pointer is advanced by
     -diagoffb * PACKMR, and diagoffb is then treated as zero.
   - n is reduced to diagoffb + k when that value is smaller than n.
   - The n dimension is split into n_iter_rct "rectangular" iterations
     followed by n_iter_tri "triangular" iterations.
   - Rectangular region: both the jr and ir loops iterate over ranges
     reported by bli_thread_range_jrir(), and the gemm micro-kernel is called
     with the full k and a beta of one.
   - Triangular region: every thread walks all j and i iterations but only
     invokes the micro-kernel where bli_trmm_my_iter_rr() is true, using the
     length k_b1121, the offset panel a1_i and beta.

   NOTE(review): istep_b is computed but never read afterwards, and the rntm
   argument is not referenced anywhere in this body.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + j * PACKMR; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* 'thread' drives the jr loop below; its sub-node 'caucus' drives the
ir loop. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the initial rectangular region of B (if it exists).
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time. \
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and
1st loops for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = is_b_cur; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + off_b1121 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
\
/* These increments run for every i, including iterations this
thread skipped, so a1 and c11 stay in step with i. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Executed for every j, whether or not this thread owned the iteration,
using the stride computed above for the current panel of B. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_ru_ker_var2.c 0000664 0000000 0000000 00000037320 14634250137 0024321 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function-pointer type for the datatype-specific trmm_ru_ker_var2
// implementations. The argument list matches the GENTFUNC-generated
// functions defined later in this file: the diagonal offset of B, the pack
// schemas of A and B, the m/n/k dimensions, and untyped buffers with their
// strides/panel parameters for alpha, A, B, beta and C.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Dispatch table that bli_trmm_ru_ker_var2() indexes with the execution
// datatype of C. NOTE(review): presumably GENARRAY fills it with the
// per-datatype trmm_ru_ker_var2 variants -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
void bli_trmm_ru_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Object-based front end for the trmm_ru macro-kernel: unpack everything
	// the typed implementation needs from the A, B and C objects, fold the
	// scalars attached to A and B into a single alpha, and dispatch on the
	// execution datatype of C.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both packed operands.
	const doff_t b_diagoff = bli_obj_diag_offset( b );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: m and n are taken from C, k from the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Buffer, column stride, panel dimension and panel stride of A.
	void*        a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Buffer, row stride, panel dimension and panel stride of B.
	void*        b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Buffer and both strides of C.
	void*        c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	obj_t        a_scalar;
	obj_t        b_scalar;

	// Pull the scalars off A and B and multiply them together; the merged
	// value is read back out of b_scalar below.
	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &b_scalar );
	bli_mulsc( &a_scalar, &b_scalar );

	// alpha is the merged A*B scalar; beta is the scalar attached to C.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &b_scalar );
	void* beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Call the implementation registered for the execution datatype.
	ftypes[exec_dt]
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Typed macro-kernel template for trmm_ru_ker_var2. GENTFUNC is instantiated
   by INSERT_GENTFUNC_BASIC0 at the bottom of this block (presumably once per
   supported datatype -- confirm against that macro's definition).

   The generated function multiplies packed micro-panels of A by packed
   micro-panels of B (the triangular operand, described by diagoffb) and
   updates C through the gemm micro-kernel queried from the context. The
   n dimension is processed in two phases:

     1. A triangular (or trapezoidal) region of n_iter_tri iterations, where
        each micro-panel of B has its own length (k_b0111) and stride
        (ps_b_cur) and the micro-kernel is called with beta.
     2. A rectangular region of n_iter_rct iterations, where each micro-panel
        of B has length k and stride ps_b and the micro-kernel is called with
        a beta of one.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. (The test below pairs the
packed dimensions PACKMR/PACKNR with NR/MR, respectively, and aborts
if either pair is odd-odd.) */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Compute number of primary and leftover components of the m and n
dimensions. The iteration counts are rounded up below so that a
partial (edge) micro-panel still receives its own iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing
( k + diagoffb ) by NR, rounding up so that any remainder still
receives an iteration. The number of iterations in the rectangular
region is computed as the remaining number of iterations in the n
dimension. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and
1st loops for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir_rr() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Every micro-panel of B visited by this loop belongs to the
triangular (or trapezoidal) region, so the micro-kernel is called
with beta here; the rectangular region handled further below is
called with one instead. This allows the current macro-kernel to
work for both trmm and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = is_b_cur; \
\
if ( bli_trmm_my_iter_rr( j, thread ) ) { \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter_rr( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + off_b0111 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Determine the thread range and increment for the 2nd and 1st loops for
the remaining rectangular region of B.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time.
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Micro-panels of B in the rectangular region use the full (possibly
reduced) length k and the uniform stride cstep_b. The micro-kernel
is called with a beta of one here, so the product is accumulated
into C rather than scaled by the caller's beta.
NOTE(review): presumably beta has already been applied to these
columns of C elsewhere -- confirm against the calling code. */ \
{ \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_var.h 0000664 0000000 0000000 00000005756 14634250137 0022705 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT declares one object-based trmm variant. Every variant shares the
// same seven-argument signature (operands a, b, c; context; runtime; control
// tree node; thread info), which is what allows bli_trmm_xx_ker_var2 to
// dispatch to the ll/lu/rl/ru variants through a table of function pointers.
//
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
// The blocked variants are currently not prototyped here.
//GENPROT( trmm_blk_var1 )
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )
// Dispatcher, followed by the four side/uplo-specific macro-kernels.
GENPROT( trmm_xx_ker_var2 )
GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )
//
// Prototype BLAS-like interfaces with void pointer operands.
//
// GENTPROT declares the typed counterpart of a macro-kernel variant: the
// function name is built from the datatype character (ch) and the variant
// name. Operands are passed as untyped buffers together with their strides,
// panel dimensions (pd_*) and panel strides (ps_*). The first parameter is
// named diagoff here; the definitions name it after the triangular operand
// (e.g. diagoffa or diagoffb).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoff, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, \
dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
// Instantiate the typed prototypes for each side/uplo combination.
INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/bli_trmm_xx_ker_var2.c 0000664 0000000 0000000 00000005423 14634250137 0024331 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Dispatch table of trmm macro-kernel variants, indexed as vars[side][uplo]:
//   side: 0 = triangular matrix is A (left),  1 = triangular matrix is B (right)
//   uplo: 0 = lower triangular,               1 = upper triangular
static l3_var_oft vars[2][2] =
{
{ bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 },
{ bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 }
};
/*
   Select and invoke the trmm macro-kernel variant that matches the operands.

   The variant is chosen from two properties of the operands' root objects:
   which operand is triangular (A implies the "left" variants, otherwise B
   and the "right" variants) and whether that triangular root is stored as
   lower or upper. All seven arguments are forwarded unchanged to the
   selected variant.
*/
void bli_trmm_xx_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// First table index: 0 when the root of A is triangular, 1 otherwise
	// (in which case B is taken to be the triangular operand).
	const dim_t side = ( bli_obj_root_is_triangular( a ) ? 0 : 1 );

	// The operand whose root carries the uplo information.
	obj_t* tri = ( side == 0 ? a : b );

	// Second table index: 0 for a lower-stored root, 1 for anything else.
	const dim_t uplo = ( bli_obj_root_is_lower( tri ) ? 0 : 1 );

	// Look up the matching macro-kernel and call it.
	vars[side][uplo]( a, b, c, cntx, rntm, cntl, thread );
}
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/ 0000775 0000000 0000000 00000000000 14634250137 0021163 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2.c 0000664 0000000 0000000 00000037203 14634250137 0025423 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a macro alias, so the typedef below actually declares the
// type name gemm_fp: a pointer to a typed trmm_ll macro-kernel function.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations of trmm_ll_ker_var2; the object-based
// wrapper below indexes it with the execution datatype of C.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);
/*
   Object-based front end for the trmm_ll macro-kernel.

   Unpacks the dimensions, buffers, strides and packing metadata from the
   operand objects, merges the scalars attached to A and B into a single
   alpha, takes beta from the scalar attached to C, and calls the typed
   implementation selected by the execution datatype of C.

   The cntl argument is accepted for signature compatibility but is not
   referenced here.
*/
void bli_trmm_ll_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to select the typed implementation.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand and the pack schemas.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  c_rows    = bli_obj_length( c );
	const dim_t  c_cols    = bli_obj_width( c );
	const dim_t  a_cols    = bli_obj_width( a );

	// Buffer, column stride and panel geometry of A.
	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Buffer, row stride and panel geometry of B.
	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// Detach the scalars of A and B and fold the former into the latter,
	// so that alpha_ab ends up holding their product.
	obj_t alpha_a;
	obj_t alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// Addresses of the merged alpha and of the beta attached to C.
	void* const alpha_buf = bli_obj_internal_scalar_buffer( &alpha_ab );
	void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Call the typed implementation for the execution datatype.
	ftypes[exec_dt]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  c_rows,
	  c_cols,
	  a_cols,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Typed macro-kernel template for trmm_ll_ker_var2. GENTFUNC is instantiated
   by INSERT_GENTFUNC_BASIC0 at the bottom of this block (presumably once per
   supported datatype -- confirm against that macro's definition).

   The generated function loops over NR-wide micro-panels of B (outer loop)
   and MR-tall micro-panels of the triangular matrix A (inner loop), calling
   the gemm micro-kernel queried from the context:

     - If the micro-panel of A intersects the diagonal, the micro-kernel runs
       over the shortened length k_a1011 with beta, and A is advanced by the
       panel-specific stride ps_a_cur.
     - If the micro-panel of A is strictly below the diagonal, the
       micro-kernel runs over the full k and accumulates into C, and A is
       advanced by rstep_a.
     - Micro-panels that satisfy neither condition are skipped.

   Edge cases (m_cur != MR or n_cur != NR) are computed through the temporary
   buffer ct rather than directly on C.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the micro-kernel's storage preference
as reported by the context: unit row stride if it prefers columns,
unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. (The test below pairs the
packed dimensions PACKMR/PACKNR with NR/MR, respectively, and aborts
if either pair is odd-odd.) */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute the indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. The iteration counts are rounded up below so that a
partial (edge) micro-panel still receives its own iteration. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel on the temporary buffer, which
now holds the edge elements of C, using beta. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, accumulating into C (beta of
one). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a beta of zero so that the
temporary buffer receives only the product. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c 0000664 0000000 0000000 00000041055 14634250137 0025767 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a macro alias, so the typedef below actually declares the
// type name gemm_fp: a pointer to a typed trmm_ll (round-robin) macro-kernel
// function.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations of trmm_ll_ker_var2rr; the object-based
// wrapper below indexes it with the execution datatype of C.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
/*
   Object-based front end of the round-robin trmm_ll macro-kernel.

   Reads the diagonal offset, pack schemas, dimensions, buffers and strides
   carried by the obj_t operands a, b and c, merges the scalars attached to
   a and b into a single alpha, and forwards everything to the typed
   implementation stored in ftypes for the execution datatype of c.

   cntx, rntm and thread are passed through unchanged; cntl is not used.
*/
void bli_trmm_ll_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    /* The execution datatype of C selects the typed kernel. */
    num_t  exec_dt = bli_obj_exec_dt( c );

    /* Structure and packing information. */
    doff_t doff_a  = bli_obj_diag_offset( a );
    pack_t pack_a  = bli_obj_pack_schema( a );
    pack_t pack_b  = bli_obj_pack_schema( b );

    /* Problem dimensions: C is m x n, A contributes k. */
    dim_t  m_dim   = bli_obj_length( c );
    dim_t  n_dim   = bli_obj_width( c );
    dim_t  k_dim   = bli_obj_width( a );

    /* Buffer, column stride, panel dimension and panel stride of A. */
    void*  a_buf   = bli_obj_buffer_at_off( a );
    inc_t  a_cs    = bli_obj_col_stride( a );
    dim_t  a_pd    = bli_obj_panel_dim( a );
    inc_t  a_ps    = bli_obj_panel_stride( a );

    /* Buffer, row stride, panel dimension and panel stride of B. */
    void*  b_buf   = bli_obj_buffer_at_off( b );
    inc_t  b_rs    = bli_obj_row_stride( b );
    dim_t  b_pd    = bli_obj_panel_dim( b );
    inc_t  b_ps    = bli_obj_panel_stride( b );

    /* Buffer and strides of C. */
    void*  c_buf   = bli_obj_buffer_at_off( c );
    inc_t  c_rs    = bli_obj_row_stride( c );
    inc_t  c_cs    = bli_obj_col_stride( c );

    /* Detach the scalars attached to A and B and multiply them together;
       the product is left in kappa_b. */
    obj_t  kappa_a;
    obj_t  kappa_b;

    bli_obj_scalar_detach( a, &kappa_a );
    bli_obj_scalar_detach( b, &kappa_b );
    bli_mulsc( &kappa_a, &kappa_b );

    /* alpha is the merged scalar; beta is the scalar attached to C. */
    void*  alpha_ptr = bli_obj_internal_scalar_buffer( &kappa_b );
    void*  beta_ptr  = bli_obj_internal_scalar_buffer( c );

    /* Dispatch to the implementation for the execution datatype. */
    ftypes[exec_dt]
    (
      doff_a,
      pack_a,
      pack_b,
      m_dim,
      n_dim,
      k_dim,
      alpha_ptr,
      a_buf, a_cs, a_pd, a_ps,
      b_buf, b_rs, b_pd, b_ps,
      beta_ptr,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
/*
   Datatype-templated body of the trmm_ll_ker_var2rr macro-kernel.

   For a given (ctype, ch) the macro defines PASTEMAC(ch,varname), which walks
   the packed micro-panels of A (pointer a1) and B (pointer b1, stepped by
   ps_b) and calls the gemm micro-kernel queried from cntx once per MR x NR
   tile of C. The range of the jr (n dimension) loop is obtained from
   bli_thread_range_jrir_rr(); the ir (m dimension) loop always runs over all
   m_iter row panels. A row panel of A is processed only if it intersects the
   diagonal or lies strictly below it; otherwise only the C pointer advances.

   The rntm parameter is accepted but never referenced in the body.

   NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
   instantiates GENTFUNC once per supported datatype -- confirm.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are derived from col_pref, i.e. from the value returned
by bli_cntx_l3_vir_ukr_prefers_cols_dt() for the gemm micro-kernel, and
not from rs_c/cs_c: when col_pref is true ct is column-stored
(rs_ct = 1, cs_ct = MR), otherwise it is row-stored (rs_ct = NR,
cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. Abort in either case. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists).
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. (In this variant the offset is always zero, so b1_i
computed below is equal to b1.) */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with zero passed as beta and
ct as the output tile; only the m_cur x n_cur part of ct
is then accumulated into C below. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
/* Advance to the next MR rows of C, whether or not the current
panel of A was used in a computation. */ \
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c 0000664 0000000 0000000 00000041045 14634250137 0025761 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific macro-kernel functions that
// the GENTFUNC macro in this file generates. FUNCPTR_T is a preprocessor alias
// for the type name gemm_fp; the parameter list below mirrors the one used by
// GENTFUNC.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations; bli_trmm_ll_ker_var2sl() indexes it with the
// execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere; presumably it expands to the
// array declarator plus an initializer naming each datatype's variant -- confirm.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
/*
   Object-based front end of the slab-partitioned trmm_ll macro-kernel.

   Reads the diagonal offset, pack schemas, dimensions, buffers and strides
   carried by the obj_t operands a, b and c, merges the scalars attached to
   a and b into a single alpha, and forwards everything to the typed
   implementation stored in ftypes for the execution datatype of c.

   cntx, rntm and thread are passed through unchanged; cntl is not used.
*/
void bli_trmm_ll_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    /* The execution datatype of C selects the typed kernel. */
    num_t  exec_dt = bli_obj_exec_dt( c );

    /* Structure and packing information. */
    doff_t doff_a  = bli_obj_diag_offset( a );
    pack_t pack_a  = bli_obj_pack_schema( a );
    pack_t pack_b  = bli_obj_pack_schema( b );

    /* Problem dimensions: C is m x n, A contributes k. */
    dim_t  m_dim   = bli_obj_length( c );
    dim_t  n_dim   = bli_obj_width( c );
    dim_t  k_dim   = bli_obj_width( a );

    /* Buffer, column stride, panel dimension and panel stride of A. */
    void*  a_buf   = bli_obj_buffer_at_off( a );
    inc_t  a_cs    = bli_obj_col_stride( a );
    dim_t  a_pd    = bli_obj_panel_dim( a );
    inc_t  a_ps    = bli_obj_panel_stride( a );

    /* Buffer, row stride, panel dimension and panel stride of B. */
    void*  b_buf   = bli_obj_buffer_at_off( b );
    inc_t  b_rs    = bli_obj_row_stride( b );
    dim_t  b_pd    = bli_obj_panel_dim( b );
    inc_t  b_ps    = bli_obj_panel_stride( b );

    /* Buffer and strides of C. */
    void*  c_buf   = bli_obj_buffer_at_off( c );
    inc_t  c_rs    = bli_obj_row_stride( c );
    inc_t  c_cs    = bli_obj_col_stride( c );

    /* Detach the scalars attached to A and B and multiply them together;
       the product is left in kappa_b. */
    obj_t  kappa_a;
    obj_t  kappa_b;

    bli_obj_scalar_detach( a, &kappa_a );
    bli_obj_scalar_detach( b, &kappa_b );
    bli_mulsc( &kappa_a, &kappa_b );

    /* alpha is the merged scalar; beta is the scalar attached to C. */
    void*  alpha_ptr = bli_obj_internal_scalar_buffer( &kappa_b );
    void*  beta_ptr  = bli_obj_internal_scalar_buffer( c );

    /* Dispatch to the implementation for the execution datatype. */
    ftypes[exec_dt]
    (
      doff_a,
      pack_a,
      pack_b,
      m_dim,
      n_dim,
      k_dim,
      alpha_ptr,
      a_buf, a_cs, a_pd, a_ps,
      b_buf, b_rs, b_pd, b_ps,
      beta_ptr,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
/*
   Datatype-templated body of the trmm_ll_ker_var2sl macro-kernel.

   For a given (ctype, ch) the macro defines PASTEMAC(ch,varname), which walks
   the packed micro-panels of A (pointer a1) and B (pointer b1, stepped by
   ps_b) and calls the gemm micro-kernel queried from cntx once per MR x NR
   tile of C. The range of the jr (n dimension) loop is obtained from
   bli_thread_range_jrir_sl(); the ir (m dimension) loop always runs over all
   m_iter row panels. A row panel of A is processed only if it intersects the
   diagonal or lies strictly below it; otherwise only the C pointer advances.

   The rntm parameter is accepted but never referenced in the body.

   NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
   instantiates GENTFUNC once per supported datatype -- confirm.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are derived from col_pref, i.e. from the value returned
by bli_cntx_l3_vir_ukr_prefers_cols_dt() for the gemm micro-kernel, and
not from rs_c/cs_c: when col_pref is true ct is column-stored
(rs_ct = 1, cs_ct = MR), otherwise it is row-stored (rs_ct = NR,
cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t off_a1011; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. Abort in either case. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use slab assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists).
NOTE: Parallelism in the 1st loop is disabled for now. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. (In this variant the offset is always zero, so b1_i
computed below is equal to b1.) */ \
off_a1011 = 0; \
k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1011, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with zero passed as beta and
ct as the output tile; only the m_cur x n_cur part of ct
is then accumulated into C below. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
/* Advance to the next MR rows of C, whether or not the current
panel of A was used in a computation. */ \
c11 += rstep_c; \
} \
} \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2.c 0000664 0000000 0000000 00000037457 14634250137 0025447 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific macro-kernel functions that
// the GENTFUNC macro in this file generates. FUNCPTR_T is a preprocessor alias
// for the type name gemm_fp; the parameter list below mirrors the one used by
// GENTFUNC.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations; bli_trmm_lu_ker_var2() indexes it with the
// execution datatype of C.
// NOTE(review): GENARRAY is defined elsewhere; presumably it expands to the
// array declarator plus an initializer naming each datatype's variant -- confirm.
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);
/*
   Object-based front end of the trmm_lu macro-kernel.

   Reads the diagonal offset, pack schemas, dimensions, buffers and strides
   carried by the obj_t operands a, b and c, merges the scalars attached to
   a and b into a single alpha, and forwards everything to the typed
   implementation stored in ftypes for the execution datatype of c.

   cntx, rntm and thread are passed through unchanged; cntl is not used.
*/
void bli_trmm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    /* The execution datatype of C selects the typed kernel. */
    num_t  exec_dt = bli_obj_exec_dt( c );

    /* Structure and packing information. */
    doff_t doff_a  = bli_obj_diag_offset( a );
    pack_t pack_a  = bli_obj_pack_schema( a );
    pack_t pack_b  = bli_obj_pack_schema( b );

    /* Problem dimensions: C is m x n, A contributes k. */
    dim_t  m_dim   = bli_obj_length( c );
    dim_t  n_dim   = bli_obj_width( c );
    dim_t  k_dim   = bli_obj_width( a );

    /* Buffer, column stride, panel dimension and panel stride of A. */
    void*  a_buf   = bli_obj_buffer_at_off( a );
    inc_t  a_cs    = bli_obj_col_stride( a );
    dim_t  a_pd    = bli_obj_panel_dim( a );
    inc_t  a_ps    = bli_obj_panel_stride( a );

    /* Buffer, row stride, panel dimension and panel stride of B. */
    void*  b_buf   = bli_obj_buffer_at_off( b );
    inc_t  b_rs    = bli_obj_row_stride( b );
    dim_t  b_pd    = bli_obj_panel_dim( b );
    inc_t  b_ps    = bli_obj_panel_stride( b );

    /* Buffer and strides of C. */
    void*  c_buf   = bli_obj_buffer_at_off( c );
    inc_t  c_rs    = bli_obj_row_stride( c );
    inc_t  c_cs    = bli_obj_col_stride( c );

    /* Detach the scalars attached to A and B and multiply them together;
       the product is left in kappa_b. */
    obj_t  kappa_a;
    obj_t  kappa_b;

    bli_obj_scalar_detach( a, &kappa_a );
    bli_obj_scalar_detach( b, &kappa_b );
    bli_mulsc( &kappa_a, &kappa_b );

    /* alpha is the merged scalar; beta is the scalar attached to C. */
    void*  alpha_ptr = bli_obj_internal_scalar_buffer( &kappa_b );
    void*  beta_ptr  = bli_obj_internal_scalar_buffer( c );

    /* Dispatch to the implementation for the execution datatype. */
    ftypes[exec_dt]
    (
      doff_a,
      pack_a,
      pack_b,
      m_dim,
      n_dim,
      k_dim,
      alpha_ptr,
      a_buf, a_cs, a_pd, a_ps,
      b_buf, b_rs, b_pd, b_ps,
      beta_ptr,
      c_buf, c_rs, c_cs,
      cntx,
      rntm,
      thread
    );
}
// GENTFUNC template for the trmm_lu_ker_var2 macrokernel. The template is
// instantiated by INSERT_GENTFUNC_BASIC0 at the bottom (presumably once per
// basic datatype -- confirm against that macro's definition).
//
// The body walks packed micropanels of A and B and updates C through the gemm
// microkernel queried from cntx:
//   - Blocks of A strictly below the diagonal are treated as implicitly zero
//     and skipped.
//   - Micropanels of A that intersect the diagonal use the shortened inner
//     dimension k_a1112, index into B at a matching offset, and pass beta.
//   - Micropanels of A strictly above the diagonal use the full k and
//     accumulate into C (beta of one, or zero-then-add for edge tiles).
//   - Edge tiles (m_cur != MR or n_cur != NR) go through the stack buffer ct.
// Iterations of the n (jr) and m (ir) loops are filtered per thread with
// bli_trmm_my_iter().
//
// Parameters:
//   diagoffa             diagonal offset of the current block of A
//   schema_a, schema_b   pack schemas of A and B (stored in the auxinfo_t)
//   m, n                 dimensions traversed in steps of MR and NR
//   k                    inner dimension handed to the microkernel
//   alpha, beta          pointers to scalars, cast to ctype*
//   a, cs_a, pd_a, ps_a  packed A; cs_a is used as PACKMR, pd_a as MR, and
//                        ps_a as the stride to the next micropanel
//   b, rs_b, pd_b, ps_b  packed B; rs_b is used as PACKNR, pd_b as NR, and
//                        ps_b as the stride to the next micropanel
//   c, rs_c, cs_c        output matrix C and its row/column strides
//   cntx                 context queried for the microkernel and its
//                        storage preference
//   rntm                 not referenced in this body
//   jr_thread            thrinfo_t node used for the jr loop; its sub-node is
//                        used for the ir loop
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The test below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both
odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) micro-panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Fetch the sub-node of jr_thread (used below to filter iterations of
the m loop) and the way count and work id of jr_thread itself. */ \
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* Advance A by the stride computed for this diagonal-intersecting
micro-panel. This sits outside the bli_trmm_my_iter() test, so
it executes whether or not this thread computed the panel. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel on ct, passing zero as beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* Advance A by the regular micro-panel stride (ps_a). */ \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c 0000664 0000000 0000000 00000041235 14634250137 0026000 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the datatype-specific trmm_lu_ker_var2rr macrokernels.
// Note that FUNCPTR_T is a macro that expands to gemm_fp, so the typedef below
// actually declares a function-pointer type named gemm_fp.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// File-local table of macrokernel function pointers. It is indexed by the
// execution datatype in bli_trmm_lu_ker_var2rr() below. The entries come from
// GENARRAY (presumably one per datatype instantiation of trmm_lu_ker_var2rr --
// confirm against the GENARRAY definition).
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
// Object-based front end for the round-robin trmm_lu macrokernel. It unpacks
// the properties of A, B, and C that the typed macrokernel needs, merges the
// scalars attached to A and B into a single alpha, and dispatches to the
// function in ftypes[] that matches the execution datatype of C.
//
//   a, b    packed operands; their attached scalars are detached and multiplied
//   c       output operand; its internal scalar is passed along as beta
//   cntx    forwarded to the typed macrokernel
//   rntm    forwarded to the typed macrokernel
//   cntl    not referenced by this function
//   thread  forwarded to the typed macrokernel
void bli_trmm_lu_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Datatype that selects the typed macrokernel.
	const num_t exec_dt = bli_obj_exec_dt( c );

	// Diagonal offset of A and the pack schemas of both packed operands.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t m_dim = bli_obj_length( c );
	const dim_t n_dim = bli_obj_width( c );
	const dim_t k_dim = bli_obj_width( a );

	// Buffer, stride, panel dimension, and panel stride of A.
	void* const a_buf       = bli_obj_buffer_at_off( a );
	const inc_t a_cs        = bli_obj_col_stride( a );
	const dim_t a_panel_dim = bli_obj_panel_dim( a );
	const inc_t a_panel_str = bli_obj_panel_stride( a );

	// Buffer, stride, panel dimension, and panel stride of B.
	void* const b_buf       = bli_obj_buffer_at_off( b );
	const inc_t b_rs        = bli_obj_row_stride( b );
	const dim_t b_panel_dim = bli_obj_panel_dim( b );
	const inc_t b_panel_str = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const c_buf = bli_obj_buffer_at_off( c );
	const inc_t c_rs  = bli_obj_row_stride( c );
	const inc_t c_cs  = bli_obj_col_stride( c );

	// Pull the scalars off A and B and multiply them together. The merged
	// value is subsequently read from the second operand of bli_mulsc().
	obj_t a_scalar;
	obj_t ab_scalar;

	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// alpha is the merged scalar; beta is the scalar attached to C.
	void* const alpha_buf = bli_obj_internal_scalar_buffer( &ab_scalar );
	void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the macrokernel for the execution datatype and call it.
	ftypes[exec_dt]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_buf,
	  a_buf, a_cs, a_panel_dim, a_panel_str,
	  b_buf, b_rs, b_panel_dim, b_panel_str,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// GENTFUNC template for the trmm_lu_ker_var2rr macrokernel (round-robin
// partitioning of the jr loop). The template is instantiated by
// INSERT_GENTFUNC_BASIC0 at the bottom (presumably once per basic datatype --
// confirm against that macro's definition).
//
// The body walks packed micropanels of A and B and updates C through the gemm
// microkernel queried from cntx:
//   - Blocks of A strictly below the diagonal are treated as implicitly zero
//     and skipped.
//   - Micropanels of A that intersect the diagonal use the shortened inner
//     dimension k_a1112, index into B at a matching offset, and pass beta.
//   - Micropanels of A strictly above the diagonal use the full k and
//     accumulate into C (beta of one, or zero-then-add for edge tiles).
//   - Edge tiles (m_cur != MR or n_cur != NR) go through the stack buffer ct.
// The range of the n (jr) loop comes from bli_thread_range_jrir_rr(); the
// per-thread filtering of the m (ir) loop is commented out in this variant.
//
// Parameters:
//   diagoffa             diagonal offset of the current block of A
//   schema_a, schema_b   pack schemas of A and B (stored in the auxinfo_t)
//   m, n                 dimensions traversed in steps of MR and NR
//   k                    inner dimension handed to the microkernel
//   alpha, beta          pointers to scalars, cast to ctype*
//   a, cs_a, pd_a, ps_a  packed A; cs_a is used as PACKMR, pd_a as MR, and
//                        ps_a as the stride to the next micropanel
//   b, rs_b, pd_b, ps_b  packed B; rs_b is used as PACKNR, pd_b as NR, and
//                        ps_b as the stride to the next micropanel
//   c, rs_c, cs_c        output matrix C and its row/column strides
//   cntx                 context queried for the microkernel and its
//                        storage preference
//   rntm                 not referenced in this body
//   thread               thrinfo_t node used for the jr loop
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The test below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both
odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) micro-panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* Locate the j-th micro-panel of B and the matching columns of C
directly from j, since j advances by jr_inc rather than by one. */ \
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
/* Advance A by the stride computed for this diagonal-intersecting
micro-panel. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel on ct, passing zero as beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
/* Advance A by the regular micro-panel stride (ps_a). */ \
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c 0000664 0000000 0000000 00000041226 14634250137 0025773 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the datatype-specific trmm_lu_ker_var2sl macrokernels.
// Note that FUNCPTR_T is a macro that expands to gemm_fp, so the typedef below
// actually declares a function-pointer type named gemm_fp.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// File-local table of macrokernel function pointers. It is indexed by the
// execution datatype in bli_trmm_lu_ker_var2sl() below. The entries come from
// GENARRAY (presumably one per datatype instantiation of trmm_lu_ker_var2sl --
// confirm against the GENARRAY definition).
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
// Object-based front end for the slab-partitioned trmm_lu macrokernel. It
// unpacks the properties of A, B, and C that the typed macrokernel needs,
// merges the scalars attached to A and B into a single alpha, and dispatches
// to the function in ftypes[] that matches the execution datatype of C.
//
//   a, b    packed operands; their attached scalars are detached and multiplied
//   c       output operand; its internal scalar is passed along as beta
//   cntx    forwarded to the typed macrokernel
//   rntm    forwarded to the typed macrokernel
//   cntl    not referenced by this function
//   thread  forwarded to the typed macrokernel
void bli_trmm_lu_ker_var2sl
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Datatype that selects the typed macrokernel.
	const num_t exec_dt = bli_obj_exec_dt( c );

	// Diagonal offset of A and the pack schemas of both packed operands.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	const pack_t b_schema  = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t m_dim = bli_obj_length( c );
	const dim_t n_dim = bli_obj_width( c );
	const dim_t k_dim = bli_obj_width( a );

	// Buffer, stride, panel dimension, and panel stride of A.
	void* const a_buf       = bli_obj_buffer_at_off( a );
	const inc_t a_cs        = bli_obj_col_stride( a );
	const dim_t a_panel_dim = bli_obj_panel_dim( a );
	const inc_t a_panel_str = bli_obj_panel_stride( a );

	// Buffer, stride, panel dimension, and panel stride of B.
	void* const b_buf       = bli_obj_buffer_at_off( b );
	const inc_t b_rs        = bli_obj_row_stride( b );
	const dim_t b_panel_dim = bli_obj_panel_dim( b );
	const inc_t b_panel_str = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const c_buf = bli_obj_buffer_at_off( c );
	const inc_t c_rs  = bli_obj_row_stride( c );
	const inc_t c_cs  = bli_obj_col_stride( c );

	// Pull the scalars off A and B and multiply them together. The merged
	// value is subsequently read from the second operand of bli_mulsc().
	obj_t a_scalar;
	obj_t ab_scalar;

	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// alpha is the merged scalar; beta is the scalar attached to C.
	void* const alpha_buf = bli_obj_internal_scalar_buffer( &ab_scalar );
	void* const beta_buf  = bli_obj_internal_scalar_buffer( c );

	// Look up the macrokernel for the execution datatype and call it.
	ftypes[exec_dt]
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_buf,
	  a_buf, a_cs, a_panel_dim, a_panel_str,
	  b_buf, b_rs, b_panel_dim, b_panel_str,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// GENTFUNC template for the type-specific trmm_lu_ker_var2sl macro-kernels.
// INSERT_GENTFUNC_BASIC0 at the bottom instantiates it (presumably once per
// basic datatype -- confirm against that macro's definition).
//
// A is the packed triangular operand and B the packed non-triangular one.
// The generated function:
//  - returns early when any dimension is zero, or when the block of A lies
//    strictly below the diagonal;
//  - when diagoffa > 0, skips the zero region left of the diagonal by
//    shrinking k and advancing the pointer into B, and trims m when the rows
//    below the diagonal would only produce no-op iterations;
//  - partitions the jr (n dimension) loop among threads with
//    bli_thread_range_jrir_sl(); the ir (m dimension) loop is executed in
//    full by each thread, since its parallelism is commented out;
//  - for micro-panels of A that intersect the diagonal, calls the gemm
//    micro-kernel with the shortened length k_a1112 and scales C by beta;
//    for micro-panels strictly above the diagonal, uses the full k and
//    accumulates into C (beta of one);
//  - routes edge tiles (m_cur != MR or n_cur != NR) through the temporary
//    buffer ct.
//
// The rntm argument is not referenced in the body.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t off_a1112; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current block of A is entirely below the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
/*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
/*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
dim_t jr_start, jr_end; \
/*dim_t ir_start, ir_end;*/ \
dim_t jr_inc; \
\
/* Use slab assignment of micropanels to threads in the 2nd loop for
the initial rectangular region of C (if it exists). */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
/*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict b1_i; \
ctype* restrict a2; \
\
/* Determine the offset to and length of the panel that was
packed so we can index into the corresponding location in
b1. */ \
off_a1112 = diagoffa_i; \
k_a1112 = k - off_a1112; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_a1112, \
alpha_cast, \
a1, \
b1_i, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
/* NOTE: ir loop parallelism disabled for now. */ \
/*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
/*}*/ \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2.c 0000664 0000000 0000000 00000040151 14634250137 0025425 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is defined as gemm_fp, so the typedef below declares the type
// gemm_fp: a pointer to a type-specific trmm_rl_ker_var2 macro-kernel. The
// parameter list mirrors the argument list that bli_trmm_rl_ker_var2() passes
// when it calls through the ftypes table.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb, // diagonal offset of B
pack_t schema_a, // pack schema of A
pack_t schema_b, // pack schema of B
dim_t m, // length of C
dim_t n, // width of C
dim_t k, // width of A
void* alpha, // merged scalar of A and B
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, // A buffer, column stride, panel dim, panel stride
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, // B buffer, row stride, panel dim, panel stride
void* beta, // scalar attached to C
void* c, inc_t rs_c, inc_t cs_c, // C buffer and its row/column strides
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macro-kernel function pointers that bli_trmm_rl_ker_var2() indexes
// by the execution datatype of C. GENARRAY presumably expands to the
// initializer listing the per-datatype instantiations -- confirm against its
// definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2);
void bli_trmm_rl_ker_var2
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Object-based front end for the typed trmm_rl_ker_var2 macro-kernels:
	// unpack everything the typed kernel needs from the obj_t operands, merge
	// the scalars of A and B into a single alpha, and dispatch on the
	// execution datatype of C. The cntl argument is not referenced here.

	// Datatype that selects the entry of the ftypes table.
	num_t exec_dt = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both packed operands.
	doff_t b_diagoff = bli_obj_diag_offset( b );
	pack_t a_schema = bli_obj_pack_schema( a );
	pack_t b_schema = bli_obj_pack_schema( b );

	// Problem dimensions: m and n come from C, k is the width of A.
	dim_t m_dim = bli_obj_length( c );
	dim_t n_dim = bli_obj_width( c );
	dim_t k_dim = bli_obj_width( a );

	// Buffer, column stride, panel dimension and panel stride of A.
	void* a_buf = bli_obj_buffer_at_off( a );
	inc_t a_cs = bli_obj_col_stride( a );
	dim_t a_pd = bli_obj_panel_dim( a );
	inc_t a_ps = bli_obj_panel_stride( a );

	// Buffer, row stride, panel dimension and panel stride of B.
	void* b_buf = bli_obj_buffer_at_off( b );
	inc_t b_rs = bli_obj_row_stride( b );
	dim_t b_pd = bli_obj_panel_dim( b );
	inc_t b_ps = bli_obj_panel_stride( b );

	// Buffer and both strides of C.
	void* c_buf = bli_obj_buffer_at_off( c );
	inc_t c_rs = bli_obj_row_stride( c );
	inc_t c_cs = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together.
	// The merged value is read back from the second operand of bli_mulsc().
	obj_t a_scalar;
	obj_t ab_scalar;

	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// Internal scalar buffers: the merged scalar acts as alpha, and the
	// scalar attached to C acts as beta.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &ab_scalar );
	void* beta_buf = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macro-kernel and hand everything over to it.
	FUNCPTR_T typed_ker = ftypes[exec_dt];

	typed_ker
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// GENTFUNC template for the type-specific trmm_rl_ker_var2 macro-kernels.
// INSERT_GENTFUNC_BASIC0 at the bottom instantiates it (presumably once per
// basic datatype -- confirm against that macro's definition).
//
// B is the packed triangular operand and A the packed non-triangular one.
// The generated function:
//  - returns early when any dimension is zero, or when the panel of B lies
//    strictly above the diagonal;
//  - when diagoffb < 0, skips the zero region above the diagonal by
//    shrinking k and advancing the pointer into A, and trims n when the
//    columns right of the diagonal would only produce no-op iterations;
//  - walks every jr (n dimension) and ir (m dimension) iteration on every
//    thread, but only performs the work for iterations accepted by
//    bli_trmm_my_iter(); the b1, a1, c11 and c1 pointers are advanced
//    regardless so they stay in step with the loop indices;
//  - for micro-panels of B that intersect the diagonal, calls the gemm
//    micro-kernel with the shortened length k_b1121 and scales C by beta;
//    for micro-panels strictly below the diagonal, uses the full k and
//    accumulates into C (beta of one);
//  - routes edge tiles (m_cur != MR or n_cur != NR) through the temporary
//    buffer ct.
//
// The rntm argument is not referenced in the body.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if PACKMR and NR are both odd, or if PACKNR and MR
are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c 0000664 0000000 0000000 00000044752 14634250137 0026004 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is defined as gemm_fp, so the typedef below declares the type
// gemm_fp: a pointer to a type-specific trmm_rl_ker_var2rr macro-kernel. The
// parameter list mirrors the argument list that bli_trmm_rl_ker_var2rr()
// passes when it calls through the ftypes table.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb, // diagonal offset of B
pack_t schema_a, // pack schema of A
pack_t schema_b, // pack schema of B
dim_t m, // length of C
dim_t n, // width of C
dim_t k, // width of A
void* alpha, // merged scalar of A and B
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, // A buffer, column stride, panel dim, panel stride
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, // B buffer, row stride, panel dim, panel stride
void* beta, // scalar attached to C
void* c, inc_t rs_c, inc_t cs_c, // C buffer and its row/column strides
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macro-kernel function pointers that bli_trmm_rl_ker_var2rr()
// indexes by the execution datatype of C. GENARRAY presumably expands to the
// initializer listing the per-datatype instantiations -- confirm against its
// definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trmm_rl_ker_var2rr
(
obj_t* a,
obj_t* b,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
)
{
	// Object-based front end for the typed trmm_rl_ker_var2rr macro-kernels:
	// unpack everything the typed kernel needs from the obj_t operands, merge
	// the scalars of A and B into a single alpha, and dispatch on the
	// execution datatype of C. The cntl argument is not referenced here.

	// Datatype that selects the entry of the ftypes table.
	num_t exec_dt = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both packed operands.
	doff_t b_diagoff = bli_obj_diag_offset( b );
	pack_t a_schema = bli_obj_pack_schema( a );
	pack_t b_schema = bli_obj_pack_schema( b );

	// Problem dimensions: m and n come from C, k is the width of A.
	dim_t m_dim = bli_obj_length( c );
	dim_t n_dim = bli_obj_width( c );
	dim_t k_dim = bli_obj_width( a );

	// Buffer, column stride, panel dimension and panel stride of A.
	void* a_buf = bli_obj_buffer_at_off( a );
	inc_t a_cs = bli_obj_col_stride( a );
	dim_t a_pd = bli_obj_panel_dim( a );
	inc_t a_ps = bli_obj_panel_stride( a );

	// Buffer, row stride, panel dimension and panel stride of B.
	void* b_buf = bli_obj_buffer_at_off( b );
	inc_t b_rs = bli_obj_row_stride( b );
	dim_t b_pd = bli_obj_panel_dim( b );
	inc_t b_ps = bli_obj_panel_stride( b );

	// Buffer and both strides of C.
	void* c_buf = bli_obj_buffer_at_off( c );
	inc_t c_rs = bli_obj_row_stride( c );
	inc_t c_cs = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together.
	// The merged value is read back from the second operand of bli_mulsc().
	obj_t a_scalar;
	obj_t ab_scalar;

	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// Internal scalar buffers: the merged scalar acts as alpha, and the
	// scalar attached to C acts as beta.
	void* alpha_buf = bli_obj_internal_scalar_buffer( &ab_scalar );
	void* beta_buf = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macro-kernel and hand everything over to it.
	FUNCPTR_T typed_ker = ftypes[exec_dt];

	typed_ker
	(
	  b_diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_buf,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_buf,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// GENTFUNC template for the type-specific trmm_rl_ker_var2rr macrokernels.
// INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr ) at the bottom instantiates it;
// each instance is named PASTEMAC(ch,varname) and operates on ctype elements.
//
// Parameters (as used by the body below):
//   diagoffb            diagonal offset of the current panel of B.
//   schema_a, schema_b  pack schemas of A and B; both are stored in the
//                       auxinfo_t handed to the micro-kernel, and schema_b
//                       also selects off_scl and ss_b_num/ss_b_den.
//   m, n, k             problem dimensions; m is tiled by MR, n by NR, and k
//                       (or a reduced k_b1121) is passed to the micro-kernel.
//   alpha               scalar passed to every micro-kernel call.
//   a, cs_a, pd_a, ps_a packed A; cs_a is aliased to PACKMR, pd_a to MR and
//                       ps_a is the step between micro-panels (rstep_a).
//   b, rs_b, pd_b, ps_b packed B; rs_b is aliased to PACKNR, pd_b to NR and
//                       ps_b is the step between micro-panels (cstep_b) in
//                       the rectangular region.
//   beta                scalar passed as beta only in the triangular region.
//   c, rs_c, cs_c       output matrix C and its strides.
//   cntx                queried for the gemm micro-kernel and its storage
//                       preference.
//   rntm                not referenced in this body.
//   thread              drives the partitioning of the jr (n) loop; its
//                       sub-node ("caucus") drives the ir (m) loop.
//
// Control flow:
//   1. Abort if PACKMR/NR or PACKNR/MR are both odd.
//   2. Return early if any dimension is zero or if
//      bli_is_strictly_above_diag_n( diagoffb, k, n ) holds.
//   3. If diagoffb < 0, reduce k by -diagoffb, advance a_cast, and treat the
//      offset as zero; then clamp n to diagoffb + k.
//   4. Rectangular region (first n_iter_rct column panels): iteration ranges
//      come from bli_thread_range_jrir_rr(); full tiles are computed directly
//      into C with one as beta, edge tiles go through ct with zero as beta
//      and are then added into C.
//   5. Triangular region (remaining panels): every thread walks all
//      iterations and uses bli_trmm_my_iter() to decide which ones to
//      compute; tiles use k_b1121 and beta_cast, with edge tiles copied
//      into and back out of ct.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Note that istep_a uses the unreduced k_full, while istep_b uses the
possibly reduced k. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The sub-node of thread ("caucus") supplies the thread info used for
the 1st (ir) loop; thread itself is used for the 2nd (jr) loop. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of B (if it exists). */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel directly on C, passing one as
beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel on the temporary buffer, passing
zero as beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Skip the leading off_b1121 entries of the current micro-panel
of A so that it lines up with the packed panel of B. */ \
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* These pointer increments run for every i, including iterations
that this thread skipped. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance b1 by the stride of the current (possibly shortened) panel;
this runs for every j, including iterations this thread skipped. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c 0000664 0000000 0000000 00000044743 14634250137 0025777 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T expands to gemm_fp, the name given to the function-pointer type
// declared immediately below.
#define FUNCPTR_T gemm_fp
// Signature of the type-specific macrokernels called through ftypes[]. The
// argument list must match, position for position, the call made in
// bli_trmm_rl_ker_var2sl() and the parameter list of the GENTFUNC template
// later in this file.
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of trmm_rl_ker_var2sl function pointers, indexed by execution
// datatype (see ftypes[dt_exec] in the wrapper below).
// NOTE(review): presumably GENARRAY fills in one entry per datatype
// instantiated by INSERT_GENTFUNC_BASIC0 -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
// Object-based entry point for the trmm_rl_ker_var2sl macrokernel.
//
// Reads the dimensions, strides, packing metadata and buffers out of the
// a, b and c objects, merges the scalars attached to a and b into a single
// alpha, and forwards everything to the type-specific kernel selected from
// ftypes[] by the execution datatype of c. cntx, rntm and thread are passed
// through unchanged; cntl is not used by this function.
void bli_trmm_rl_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to pick the kernel instance.
	num_t      exec_dt  = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both inputs.
	doff_t     b_doff   = bli_obj_diag_offset( b );
	pack_t     a_schema = bli_obj_pack_schema( a );
	pack_t     b_schema = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	dim_t      dim_m    = bli_obj_length( c );
	dim_t      dim_n    = bli_obj_width( c );
	dim_t      dim_k    = bli_obj_width( a );

	// Buffer and packing parameters of A.
	void*      a_buf    = bli_obj_buffer_at_off( a );
	inc_t      a_cs     = bli_obj_col_stride( a );
	dim_t      a_pd     = bli_obj_panel_dim( a );
	inc_t      a_ps     = bli_obj_panel_stride( a );

	// Buffer and packing parameters of B.
	void*      b_buf    = bli_obj_buffer_at_off( b );
	inc_t      b_rs     = bli_obj_row_stride( b );
	dim_t      b_pd     = bli_obj_panel_dim( b );
	inc_t      b_ps     = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void*      c_buf    = bli_obj_buffer_at_off( c );
	inc_t      c_rs     = bli_obj_row_stride( c );
	inc_t      c_cs     = bli_obj_col_stride( c );

	obj_t      a_scalar;
	obj_t      ab_scalar;
	void*      alpha_ptr;
	void*      beta_ptr;

	// Pull the scalars off A and B, then fold A's scalar into B's so that
	// ab_scalar holds their product.
	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// alpha is the merged scalar; beta is the scalar attached to C.
	alpha_ptr = bli_obj_internal_scalar_buffer( &ab_scalar );
	beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Dispatch straight through the datatype-indexed function table.
	ftypes[ exec_dt ]
	(
	  b_doff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
//
// GENTFUNC template for the type-specific trmm_rl_ker_var2sl macrokernels.
// INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl ) at the bottom instantiates it;
// each instance is named PASTEMAC(ch,varname) and operates on ctype elements.
//
// Parameters (as used by the body below):
//   diagoffb            diagonal offset of the current panel of B.
//   schema_a, schema_b  pack schemas of A and B; both are stored in the
//                       auxinfo_t handed to the micro-kernel, and schema_b
//                       also selects off_scl and ss_b_num/ss_b_den.
//   m, n, k             problem dimensions; m is tiled by MR, n by NR, and k
//                       (or a reduced k_b1121) is passed to the micro-kernel.
//   alpha               scalar passed to every micro-kernel call.
//   a, cs_a, pd_a, ps_a packed A; cs_a is aliased to PACKMR, pd_a to MR and
//                       ps_a is the step between micro-panels (rstep_a).
//   b, rs_b, pd_b, ps_b packed B; rs_b is aliased to PACKNR, pd_b to NR and
//                       ps_b is the step between micro-panels (cstep_b) in
//                       the rectangular region.
//   beta                scalar passed as beta only in the triangular region.
//   c, rs_c, cs_c       output matrix C and its strides.
//   cntx                queried for the gemm micro-kernel and its storage
//                       preference.
//   rntm                not referenced in this body.
//   thread              drives the partitioning of the jr (n) loop; its
//                       sub-node ("caucus") drives the ir (m) loop.
//
// Control flow:
//   1. Abort if PACKMR/NR or PACKNR/MR are both odd.
//   2. Return early if any dimension is zero or if
//      bli_is_strictly_above_diag_n( diagoffb, k, n ) holds.
//   3. If diagoffb < 0, reduce k by -diagoffb, advance a_cast, and treat the
//      offset as zero; then clamp n to diagoffb + k.
//   4. Rectangular region (first n_iter_rct column panels): iteration ranges
//      come from bli_thread_range_jrir_sl(); full tiles are computed directly
//      into C with one as beta, edge tiles go through ct with zero as beta
//      and are then added into C.
//   5. Triangular region (remaining panels): every thread walks all
//      iterations and uses bli_trmm_my_iter() to decide which ones to
//      compute; tiles use k_b1121 and beta_cast, with edge tiles copied
//      into and back out of ct.
//
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t off_b1121; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above the diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it to prevent
"no-op" iterations from executing. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Note that istep_a uses the unreduced k_full, while istep_b uses the
possibly reduced k. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
/* Round odd imaginary strides up to the next even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The sub-node of thread ("caucus") supplies the thread info used for
the 1st (ir) loop; thread itself is used for the 2nd (jr) loop. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the rectangular
part of B, and the triangular portion. */ \
dim_t n_iter_rct; \
dim_t n_iter_tri; \
\
if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the second set of
loops. */ \
n_iter_rct = n_iter; \
n_iter_tri = 0; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the rectangular region by dividing NR into the diagonal
offset. (There should never be any remainder in this division.) The
number of iterations in the triangular (or trapezoidal) region is
computed as the remaining number of iterations in the n dimension. */ \
n_iter_rct = diagoffb / NR; \
n_iter_tri = n_iter - n_iter_rct; \
} \
\
/* Use slab assignment of micropanels to threads in the 2nd and 1st
loops for the initial rectangular region of B (if it exists). */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel directly on C, passing one as
beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel on the temporary buffer, passing
zero as beta. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
/* If there is no triangular region, then we're done. */ \
if ( n_iter_tri == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the remaining triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it.
NOTE(review): although this is the slab variant, the loops below use
bli_is_last_iter_rr() rather than bli_is_last_iter_sl(); that agrees
with the round-robin wording above, but confirm it is intentional. */ \
\
/* Advance the starting b1 and c1 pointers to the positions corresponding
to the start of the triangular region of B. */ \
jr_start = n_iter_rct; \
b1 = b_cast + jr_start * cstep_b; \
c1 = c_cast + jr_start * cstep_c; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to the beginning of the panel that
was packed so we can index into the corresponding location
in A. Then compute the length of that panel. */ \
off_b1121 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b1121; \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly below the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Skip the leading off_b1121 entries of the current micro-panel
of A so that it lines up with the packed panel of B. */ \
a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b1121, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* These pointer increments run for every i, including iterations
that this thread skipped. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance b1 by the stride of the current (possibly shortened) panel;
this runs for every j, including iterations this thread skipped. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2.c 0000664 0000000 0000000 00000040164 14634250137 0025442 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T expands to gemm_fp, the name given to the function-pointer type
// declared immediately below.
#define FUNCPTR_T gemm_fp
// Signature of the type-specific macrokernels called through ftypes[]. The
// argument list must match, position for position, the call made in
// bli_trmm_ru_ker_var2() and the parameter list of the GENTFUNC template
// later in this file.
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of trmm_ru_ker_var2 function pointers, indexed by execution
// datatype (see ftypes[dt_exec] in the wrapper below).
// NOTE(review): presumably GENARRAY fills in one entry per datatype
// instantiated by INSERT_GENTFUNC_BASIC0 -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2);
// Object-based entry point for the trmm_ru_ker_var2 macrokernel.
//
// Reads the dimensions, strides, packing metadata and buffers out of the
// a, b and c objects, merges the scalars attached to a and b into a single
// alpha, and forwards everything to the type-specific kernel selected from
// ftypes[] by the execution datatype of c. cntx, rntm and thread are passed
// through unchanged; cntl is not used by this function.
void bli_trmm_ru_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to pick the kernel instance.
	num_t      exec_dt  = bli_obj_exec_dt( c );

	// Diagonal offset of B and the pack schemas of both inputs.
	doff_t     b_doff   = bli_obj_diag_offset( b );
	pack_t     a_schema = bli_obj_pack_schema( a );
	pack_t     b_schema = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	dim_t      dim_m    = bli_obj_length( c );
	dim_t      dim_n    = bli_obj_width( c );
	dim_t      dim_k    = bli_obj_width( a );

	// Buffer and packing parameters of A.
	void*      a_buf    = bli_obj_buffer_at_off( a );
	inc_t      a_cs     = bli_obj_col_stride( a );
	dim_t      a_pd     = bli_obj_panel_dim( a );
	inc_t      a_ps     = bli_obj_panel_stride( a );

	// Buffer and packing parameters of B.
	void*      b_buf    = bli_obj_buffer_at_off( b );
	inc_t      b_rs     = bli_obj_row_stride( b );
	dim_t      b_pd     = bli_obj_panel_dim( b );
	inc_t      b_ps     = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void*      c_buf    = bli_obj_buffer_at_off( c );
	inc_t      c_rs     = bli_obj_row_stride( c );
	inc_t      c_cs     = bli_obj_col_stride( c );

	obj_t      a_scalar;
	obj_t      ab_scalar;
	void*      alpha_ptr;
	void*      beta_ptr;

	// Pull the scalars off A and B, then fold A's scalar into B's so that
	// ab_scalar holds their product.
	bli_obj_scalar_detach( a, &a_scalar );
	bli_obj_scalar_detach( b, &ab_scalar );
	bli_mulsc( &a_scalar, &ab_scalar );

	// alpha is the merged scalar; beta is the scalar attached to C.
	alpha_ptr = bli_obj_internal_scalar_buffer( &ab_scalar );
	beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Dispatch straight through the datatype-indexed function table.
	ftypes[ exec_dt ]
	(
	  b_doff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Datatype-generic body of the trmm_ru macrokernel (variant 2).

   GENTFUNC is redefined here and then expanded by INSERT_GENTFUNC_BASIC0 at
   the end of this block to define the functions PASTEMAC(ch,varname)
   (presumably one per supported datatype -- confirm against the
   INSERT_GENTFUNC_BASIC0 definition).

   The generated function walks the n dimension in steps of NR (one
   micro-panel of B per iteration) and, inside that, the m dimension in steps
   of MR (one micro-panel of A per iteration), calling the gemm micro-kernel
   obtained from the context for each (i,j) pair:
   - If the current micro-panel of B intersects the diagonal, only k_b0111
     rows of it are used and C is scaled by beta.
   - If it lies strictly above the diagonal, all k rows are used and the
     product is accumulated into C (beta of one).
   Edge tiles (m_cur != MR or n_cur != NR) are routed through the stack
   buffer ct. An iteration is computed only when bli_trmm_my_iter() returns
   true for the corresponding thrinfo_t node; pointers are advanced either
   way. The rntm parameter is not referenced in the body.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* jr_thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the micro-kernel's storage preference
(col_pref below): column-major with leading dimension MR if the
micro-kernel prefers columns, row-major with leading dimension NR
otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides: A uses the unreduced k_full, B the (possibly
reduced) k. Both are rounded up to an even value. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* jr_thread gates the outer (n) loop below; its sub-node ir_thread gates
the inner (m) loop. */ \
thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* Diagonal offset relative to the j-th micro-panel of B. */ \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, scale C
by beta. If it is strictly above the diagonal, scale by one.
This allows the current macro-kernel to work for both trmm
and trmm3. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* off_b0111 is zero here, so a1_i equals a1. */ \
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* Advance A and C on every iteration, including skipped ones, so
the pointers stay in step with i. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance B by the stride of this diagonal-intersecting panel whether
or not this thread computed it. */ \
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, accumulating into C (beta is one). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, overwriting ct (beta is zero). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* Advance A and C on every iteration, including skipped ones. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance B by the regular panel stride whether or not this thread
computed the panel. */ \
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c 0000664 0000000 0000000 00000046604 14634250137 0026013 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a macro alias: the typedef below therefore declares a function
// pointer type whose actual name is gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of a datatype-specific macrokernel. The arguments mirror what the
// object-based wrapper bli_trmm_ru_ker_var2rr() below passes: the diagonal
// offset of B, the pack schemas of A and B, the m/n/k dimensions, the alpha
// and beta scalar buffers, the buffer plus stride/panel information for A, B
// and C, and the context, runtime and thread-info pointers.
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers, indexed by the execution datatype
// of C in the wrapper below (ftypes[dt_exec]).
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
/*
 * Object-based entry point for the round-robin trmm_ru macrokernel.
 *
 * Pulls the dimensions, strides, packing metadata and buffer addresses out of
 * the operand objects, merges the scalars attached to A and B into a single
 * alpha, and forwards everything to the datatype-specific macrokernel that
 * ftypes[] holds for the execution datatype of C.
 *
 *   a, b, c - operand objects; m and n come from c, k from the width of a.
 *   cntx    - context, forwarded unchanged.
 *   rntm    - runtime object, forwarded unchanged.
 *   cntl    - control tree node; not referenced by this function.
 *   thread  - thread info node, forwarded unchanged.
 */
void bli_trmm_ru_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to pick the macrokernel instance.
	num_t   exec_dt  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand and both pack schemas.
	doff_t  doff_b   = bli_obj_diag_offset( b );
	pack_t  pack_a   = bli_obj_pack_schema( a );
	pack_t  pack_b   = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t   dim_m    = bli_obj_length( c );
	dim_t   dim_n    = bli_obj_width( c );
	dim_t   dim_k    = bli_obj_width( a );

	// Operand A: buffer, column stride, panel dimension, panel stride.
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   a_cs     = bli_obj_col_stride( a );
	dim_t   a_pd     = bli_obj_panel_dim( a );
	inc_t   a_ps     = bli_obj_panel_stride( a );

	// Operand B: buffer, row stride, panel dimension, panel stride.
	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   b_rs     = bli_obj_row_stride( b );
	dim_t   b_pd     = bli_obj_panel_dim( b );
	inc_t   b_ps     = bli_obj_panel_stride( b );

	// Operand C: buffer and both strides.
	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   c_rs     = bli_obj_row_stride( c );
	inc_t   c_cs     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together;
	// the product ends up in alpha_ab.
	obj_t   alpha_a;
	obj_t   alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// Addresses of the merged alpha and of the scalar attached to C (beta).
	void*   alpha_ptr = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*   beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Look up the macrokernel for this datatype and call it.
	FUNCPTR_T ker = ftypes[exec_dt];

	ker
	(
	  doff_b,
	  pack_a,
	  pack_b,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Datatype-generic body of the round-robin trmm_ru macrokernel.

   GENTFUNC is redefined here and then expanded by INSERT_GENTFUNC_BASIC0 at
   the end of this block to define the functions PASTEMAC(ch,varname)
   (presumably one per supported datatype -- confirm against the
   INSERT_GENTFUNC_BASIC0 definition).

   The n dimension is split into two consecutive regions:
   - the first n_iter_tri micro-panels of B, which are handled as
     diagonal-intersecting: only k_b0111 rows are used, C is scaled by beta,
     and every thread walks all iterations but computes only those for which
     bli_trmm_my_iter() returns true;
   - the remaining n_iter_rct micro-panels, which use all k rows, accumulate
     into C (beta of one), and are partitioned across threads with
     bli_thread_range_jrir_rr().
   Edge tiles (m_cur != MR or n_cur != NR) are routed through the stack
   buffer ct. The rntm parameter is not referenced in the body.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the micro-kernel's storage preference
(col_pref below): column-major with leading dimension MR if the
micro-kernel prefers columns, row-major with leading dimension NR
otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* Count a partial (edge) panel as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides: A uses the unreduced k_full, B the (possibly
reduced) k. Both are rounded up to an even value. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension.
NOTE(review): despite the remark above, the expression below does
round up by one iteration when a remainder exists. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* Diagonal offset relative to the j-th micro-panel of B. */ \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Every panel in this triangular region is handled as a diagonal-
intersecting panel, so C is scaled by beta here. The rectangular
region further below scales by one instead. This allows the
current macro-kernel to work for both trmm and trmm3. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* off_b0111 is zero here, so a1_i equals a1. */ \
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
/* Advance A and C on every iteration, including skipped ones, so
the pointers stay in step with i. */ \
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance B by the stride of this diagonal-intersecting panel whether
or not this thread computed it. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd and 1st
loops for the remaining rectangular region of B. */ \
bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Panels in the rectangular region use all k rows of B and are
accumulated into C (beta of one), in contrast to the triangular
region above, which scales C by beta. This allows the current
macro-kernel to work for both trmm and trmm3. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
/* Index A and C directly from i, since this loop visits only the
iterations assigned to this thread. */ \
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel, accumulating into C (beta is one). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel, overwriting ct (beta is zero). */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c 0000664 0000000 0000000 00000046575 14634250137 0026015 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a macro alias: the typedef below therefore declares a function
// pointer type whose actual name is gemm_fp.
#define FUNCPTR_T gemm_fp
// Signature of a datatype-specific macrokernel. The arguments mirror what the
// object-based wrapper bli_trmm_ru_ker_var2sl() below passes: the diagonal
// offset of B, the pack schemas of A and B, the m/n/k dimensions, the alpha
// and beta scalar buffers, the buffer plus stride/panel information for A, B
// and C, and the context, runtime and thread-info pointers.
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* beta,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of macrokernel function pointers, indexed by the execution datatype
// of C in the wrapper below (ftypes[dt_exec]).
static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
/*
 * Object-based entry point for the slab-partitioned trmm_ru macrokernel.
 *
 * Pulls the dimensions, strides, packing metadata and buffer addresses out of
 * the operand objects, merges the scalars attached to A and B into a single
 * alpha, and forwards everything to the datatype-specific macrokernel that
 * ftypes[] holds for the execution datatype of C.
 *
 *   a, b, c - operand objects; m and n come from c, k from the width of a.
 *   cntx    - context, forwarded unchanged.
 *   rntm    - runtime object, forwarded unchanged.
 *   cntl    - control tree node; not referenced by this function.
 *   thread  - thread info node, forwarded unchanged.
 */
void bli_trmm_ru_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to pick the macrokernel instance.
	num_t   exec_dt  = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand and both pack schemas.
	doff_t  doff_b   = bli_obj_diag_offset( b );
	pack_t  pack_a   = bli_obj_pack_schema( a );
	pack_t  pack_b   = bli_obj_pack_schema( b );

	// Problem dimensions.
	dim_t   dim_m    = bli_obj_length( c );
	dim_t   dim_n    = bli_obj_width( c );
	dim_t   dim_k    = bli_obj_width( a );

	// Operand A: buffer, column stride, panel dimension, panel stride.
	void*   a_buf    = bli_obj_buffer_at_off( a );
	inc_t   a_cs     = bli_obj_col_stride( a );
	dim_t   a_pd     = bli_obj_panel_dim( a );
	inc_t   a_ps     = bli_obj_panel_stride( a );

	// Operand B: buffer, row stride, panel dimension, panel stride.
	void*   b_buf    = bli_obj_buffer_at_off( b );
	inc_t   b_rs     = bli_obj_row_stride( b );
	dim_t   b_pd     = bli_obj_panel_dim( b );
	inc_t   b_ps     = bli_obj_panel_stride( b );

	// Operand C: buffer and both strides.
	void*   c_buf    = bli_obj_buffer_at_off( c );
	inc_t   c_rs     = bli_obj_row_stride( c );
	inc_t   c_cs     = bli_obj_col_stride( c );

	// Detach the scalars attached to A and B and multiply them together;
	// the product ends up in alpha_ab.
	obj_t   alpha_a;
	obj_t   alpha_ab;

	bli_obj_scalar_detach( a, &alpha_a );
	bli_obj_scalar_detach( b, &alpha_ab );
	bli_mulsc( &alpha_a, &alpha_ab );

	// Addresses of the merged alpha and of the scalar attached to C (beta).
	void*   alpha_ptr = bli_obj_internal_scalar_buffer( &alpha_ab );
	void*   beta_ptr  = bli_obj_internal_scalar_buffer( c );

	// Look up the macrokernel for this datatype and call it.
	FUNCPTR_T ker = ftypes[exec_dt];

	ker
	(
	  doff_b,
	  pack_a,
	  pack_b,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_ptr,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  beta_ptr,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   Macro-kernel template for trmm_ru_ker_var2sl.

   GENTFUNC( ctype, ch, varname ) expands to one function named
   PASTEMAC(ch,varname) per instantiation; INSERT_GENTFUNC_BASIC0 at the
   bottom instantiates it for the name trmm_ru_ker_var2sl.

   The generated function walks the n dimension in steps of NR (= pd_b) and
   the m dimension in steps of MR (= pd_a), invoking the gemm micro-kernel
   queried from the context on packed micro-panels of A and B:

   - First, the micro-panels of B that make up the "triangular" region
     (n_iter_tri iterations). Iterations are handed to threads via
     bli_trmm_my_iter(), each micro-panel of B has its own length k_b0111
     and stride ps_b_cur, and C is scaled by the caller-supplied beta.
   - Then, the remaining "rectangular" region (n_iter_rct iterations).
     Iteration ranges come from bli_thread_range_jrir_sl(), the full k and
     the fixed stride ps_b are used, and results are accumulated into C
     (beta of one, or zero into the temporary ct followed by adds_mxn).

   Edge micro-tiles (m_cur < MR or n_cur < NR) are computed through the
   stack buffer ct and then copied/added into C.

   Parameters of the generated function:
     diagoffb          diagonal offset of the current panel of B.
     schema_a/schema_b pack schemas, stored into the auxinfo_t.
     m, n, k           problem dimensions (n and k may be reduced locally).
     alpha, beta       pointers to scalars of type ctype.
     a, cs_a, pd_a, ps_a   packed A, with PACKMR = cs_a, MR = pd_a and
                           ps_a the stride between micro-panels.
     b, rs_b, pd_b, ps_b   packed B, with PACKNR = rs_b, NR = pd_b and
                           ps_b the stride between micro-panels.
     c, rs_c, cs_c     output matrix and its strides.
     cntx, rntm        context and runtime objects (rntm is not referenced
                       in the body).
     thread            thrinfo_t node for the 2nd (jr) loop; its sub-node is
                       used for the 1st (ir) loop.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* beta, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Query the context for the micro-kernel address and cast it to its
function pointer type. */ \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set according to the micro-kernel's storage
preference as reported by the context (col_pref below): unit row
stride if it prefers columns, unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict one = PASTEMAC(ch,1); \
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha_cast = alpha; \
ctype* restrict beta_cast = beta; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t off_b0111; \
dim_t i, j, jb0; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The check below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full. For all trmm, k_full is simply k. This is
needed because some parameter combinations of trmm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of A (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = k; \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. And if we are packing real-only, imag-only, or
summed-only, we need to scale the computed panel sizes by 1/2
to compensate for the fact that the pointer arithmetic occurs
in terms of complex elements rather than real elements. */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. After the increments below, m_iter and n_iter count the
partial (edge) iteration as well. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
/* Round the imaginary strides up to an even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
loop around the microkernel. Here we query the thrinfo_t node for the
1st (ir) loop around the microkernel. */ \
thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
dim_t ir_nt = bli_thread_n_way( caucus ); \
dim_t ir_tid = bli_thread_work_id( caucus ); \
\
dim_t jr_start, jr_end; \
dim_t ir_start, ir_end; \
dim_t jr_inc, ir_inc; \
\
/* Note that we partition the 2nd loop into two regions: the triangular
part of C, and the rectangular portion. */ \
dim_t n_iter_tri; \
dim_t n_iter_rct; \
\
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \
{ \
/* If the entire panel of B does not intersect the diagonal, there is
no triangular region, and therefore we can skip the first set of
loops. */ \
n_iter_tri = 0; \
n_iter_rct = n_iter; \
} \
else \
{ \
/* If the panel of B does intersect the diagonal, compute the number of
iterations in the triangular (or trapezoidal) region by dividing NR
into the number of rows in B. (There should never be any remainder
in this division.) The number of iterations in the rectangular region
is computed as the remaining number of iterations in the n dimension.
NOTE: the expression below rounds up, so if a remainder did occur it
would be counted as one additional triangular iteration. */ \
n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \
n_iter_rct = n_iter - n_iter_tri; \
} \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop
for the initial triangular region of B (if it exists).
NOTE: We don't need to call bli_thread_range_jrir*() here since we
employ a hack that calls for each thread to execute every iteration
of the jr and ir loops but skip all but the pointer increment for
iterations that are not assigned to it. */ \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter_tri; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b0111 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
\
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Every micro-panel of B handled by this loop belongs to the
triangular region, so the micro-kernel calls below apply the
caller-supplied beta (beta_cast) to C. The rectangular region
handled by the second loop nest uses one instead. */ \
{ \
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
if ( bli_trmm_my_iter( j, thread ) ) { \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trmm_my_iter( i, caucus ) ) { \
\
ctype* restrict a1_i; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* off_b0111 is always zero in this variant, so a1_i equals a1. */ \
a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Copy edge elements of C to the temporary buffer. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
c11, rs_c, cs_c, \
ct, rs_ct, cs_ct ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k_b0111, \
alpha_cast, \
a1_i, \
b1, \
beta_cast, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
} \
\
/* Advance past the current micro-panel of B regardless of whether
this thread computed with it. */ \
b1 += ps_b_cur; \
} \
\
c1 += cstep_c; \
} \
\
/* If there is no rectangular region, then we're done. */ \
if ( n_iter_rct == 0 ) return; \
\
/* Use slab assignment of micropanels to threads in the 2nd and 1st
loops for the remaining rectangular region of B. */ \
bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
/* Advance the start and end iteration offsets for the rectangular region
by the number of iterations used for the triangular region. */ \
jr_start += n_iter_tri; \
jr_end += n_iter_tri; \
jb0 = n_iter_tri; \
\
/* Save the resulting value of b1 from the previous loop since it represents
the starting point for the rectangular region. */ \
b_cast = b1; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* NOTE: We must index through b_cast differently since it contains
the starting address of the rectangular region (which is already
n_iter_tri logical iterations through B). */ \
b1 = b_cast + (j-jb0) * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* In this rectangular region the product is accumulated into C:
interior micro-tiles call the micro-kernel with a beta of one, and
edge micro-tiles compute into ct with a beta of zero and are then
added to C. beta_cast is not used here. */ \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = ir_start; i < ir_end; i += ir_inc ) \
{ \
ctype* restrict a2; \
\
a1 = a_cast + i * rstep_a; \
c11 = c1 + i * rstep_c; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \
{ \
a2 = a_cast; \
b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
one, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
alpha_cast, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
} \
} \
\
\
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}
INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trmm3/ 0000775 0000000 0000000 00000000000 14634250137 0020125 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/trmm3/bli_trmm3.h 0000664 0000000 0000000 00000003247 14634250137 0022174 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_trmm3_front.h"
cython-blis-1.0.0/blis/_src/frame/3/trmm3/bli_trmm3_front.c 0000664 0000000 0000000 00000014352 14634250137 0023376 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Front-end for trmm3. Handles the alpha == 0 shortcut, prepares local
// aliases of the operands (origin reset, transposition of A folded into its
// strides, optional transposition of the whole operation, operand swap for
// the right-side case), sets the pack schemas and ways of parallelism, and
// finally dispatches to bli_l3_int via the level-3 thread decorator.
void bli_trmm3_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	bli_init_once();

	// Shortcut: with a zero alpha the product term vanishes, so all that
	// is left to do is scale C by beta.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Work on aliases of the operands so that the caller's objects are
	// never modified. For each alias, fold the row/column offsets into the
	// buffer address and zero the offsets, so that an object that was a
	// view into a larger matrix "forgets" its lineage.
	obj_t a_work;
	obj_t b_work;
	obj_t c_work;

	bli_obj_alias_to( a, &a_work );
	bli_obj_reset_origin( &a_work );

	bli_obj_alias_to( b, &b_work );
	bli_obj_reset_origin( &b_work );

	bli_obj_alias_to( c, &c_work );
	bli_obj_reset_origin( &c_work );

	// Only the no-transpose cases are implemented explicitly. If A is
	// marked for transposition, induce the transposition on the alias and
	// clear the transposition bit. The matrix then appears to have the
	// opposite triangle stored without a transpose, and the algorithm for
	// that triangle visits the correct partitions in the correct direction.
	if ( bli_obj_has_trans( &a_work ) )
	{
		bli_obj_induce_trans( &a_work );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_work );
	}

	// Decide whether the whole operation should be transposed.
#ifdef BLIS_DISABLE_TRMM3_RIGHT
	// Right-side trmm3 is disabled: cast it in terms of the left side.
	// Subconfigurations define BLIS_DISABLE_TRMM3_RIGHT when their gemm
	// microkernel assumes the packing kernel has already duplicated
	// (broadcast) the elements of B in the packed copy of B. Duplication is
	// not supported in the logic that packs micropanels from triangular
	// matrices, so those subconfigurations force the triangular matrix to
	// the left (and the general matrix to the right).
	// NOTE: This can lead to the microkernel being executed on an output
	// matrix with the microkernel's general stride IO case (unless the
	// microkernel supports both row and column IO cases as well).
	// NOTE: It also reduces the number of macrokernels exercised to two
	// (trmm_ll and trmm_lu).
	const bool transpose_operation = bli_is_right( side );
#else
	// An optimization: if C is stored by rows and the micro-kernel prefers
	// contiguous columns, or C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation so that the
	// micro-kernel accesses C in its preferred manner.
	const bool transpose_operation =
	  bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_work, BLIS_GEMM_UKR, cntx );
#endif

	if ( transpose_operation )
	{
		// Transposing the operation flips the side and transposes every
		// operand.
		bli_toggle_side( &side );
		bli_obj_induce_trans( &a_work );
		bli_obj_induce_trans( &b_work );
		bli_obj_induce_trans( &c_work );
	}

#ifndef BLIS_DISABLE_TRMM3_RIGHT
	// If A is (still) being multiplied from the right, swap A and B so
	// that the triangular matrix actually sits on the right.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( &a_work, &b_work );
	}
#endif

	// Record the pack schemas in the (aliased) objects.
	bli_l3_set_schemas( &a_work, &b_work, &c_work, cntx );

	// Interpret the rntm_t object to establish the ways of parallelism for
	// each loop, including any adjustments specific to this operation.
	bli_rntm_set_ways_for_op
	(
	  BLIS_TRMM3,
	  side,
	  bli_obj_length( &c_work ),
	  bli_obj_width( &c_work ),
	  bli_obj_width( &a_work ),
	  rntm
	);

	// Hand the problem to the internal back-end.
	bli_l3_thread_decorator
	(
	  bli_l3_int,
	  BLIS_TRMM, // operation family id
	  alpha,
	  &a_work,
	  &b_work,
	  beta,
	  &c_work,
	  cntx,
	  rntm,
	  cntl
	);
}
cython-blis-1.0.0/blis/_src/frame/3/trmm3/bli_trmm3_front.h 0000664 0000000 0000000 00000003541 14634250137 0023401 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Front-end for the trmm3 operation (defined in bli_trmm3_front.c).
//   side         - side from which the matrix a is multiplied.
//   alpha, beta  - scalar objects; when alpha equals BLIS_ZERO the
//                  implementation only scales c by beta and returns.
//   a, b, c      - operand objects; the implementation works on local
//                  aliases of these objects.
//   cntx, rntm   - context and runtime objects, forwarded to the back-end.
//   cntl         - control tree, forwarded to the back-end.
void bli_trmm3_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
obj_t* beta,
obj_t* c,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
cython-blis-1.0.0/blis/_src/frame/3/trsm/ 0000775 0000000 0000000 00000000000 14634250137 0020050 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm.h 0000664 0000000 0000000 00000003333 14634250137 0022036 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_trsm_cntl.h"
#include "bli_trsm_front.h"
#include "bli_trsm_var.h"
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_blk_var1.c 0000664 0000000 0000000 00000014256 14634250137 0023620 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//#define PRINT
// Blocked variant 1 of trsm: partitions along the m dimension.
//
// The work is done in two phases separated by a thread barrier:
//   1. the trsm subproblem on the diagonal block A11 (the first kc rows in
//      the partitioning direction), which every thread iterates over in
//      full, using the "prenode" sub-branches of cntl and thread;
//   2. the gemm subproblem on the remaining rows (the subpartition ahead of
//      A11), where each thread handles only its own range, using the
//      regular sub-nodes of cntl and thread.
void bli_trsm_blk_var1
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	dim_t range_beg, range_end;
	dim_t blk;

	// Figure out whether we move forwards or backwards through A and C.
	dir_t direct = bli_l3_direct( a, b, c, cntl );

	// Drop any zero region lying along the partitioning dimension.
	bli_l3_prune_unref_mparts_m( a, b, c, cntl );

	// The diagonal block is kc x kc, where kc is the width of A (after
	// accounting for transposition).
	const dim_t kc = bli_obj_width_after_trans( a );

	// Carve out the diagonal block of A and the matching row panel of C.
	obj_t a_diag, c_diag;

	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
	                        0, kc, a, &a_diag );
	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
	                        0, kc, c, &c_diag );

	// Phase 1 is not divided among threads: each one covers [0, kc).
	range_beg = 0;
	range_end = kc;

#ifdef PRINT
	printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
	        (int)bli_obj_length( &a_diag ), (int)bli_obj_width( &a_diag ),
	        (int)bli_obj_row_off( &a_diag ), (int)bli_obj_col_off( &a_diag ) );
	printf( "bli_trsm_blk_var1(): entering trsm subproblem loop.\n" );
#endif

	// Phase 1: walk the diagonal block along m, one blocksize at a time.
	for ( dim_t off = range_beg; off < range_end; off += blk )
	{
		obj_t a_diag_blk, c_diag_blk;

		// Blocksize for this step.
		blk = bli_determine_blocksize( direct, off, range_end, &a_diag,
		                               bli_cntl_bszid( cntl ), cntx );

		// Current row blocks of the diagonal block and of its C panel.
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
		                        off, blk, &a_diag, &a_diag_blk );
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
		                        off, blk, &c_diag, &c_diag_blk );

#ifdef PRINT
		printf( "bli_trsm_blk_var1(): a11_1 is %d x %d at offsets (%3d, %3d)\n",
		        (int)bli_obj_length( &a_diag_blk ), (int)bli_obj_width( &a_diag_blk ),
		        (int)bli_obj_row_off( &a_diag_blk ), (int)bli_obj_col_off( &a_diag_blk ) );
#endif

		// Solve the trsm subproblem through the prenode branches.
		bli_l3_int
		(
		  &BLIS_ONE,
		  &a_diag_blk,
		  b,
		  &BLIS_ONE,
		  &c_diag_blk,
		  cntx,
		  rntm,
		  bli_cntl_sub_prenode( cntl ),
		  bli_thrinfo_sub_prenode( thread )
		);
	}

#ifdef PRINT
	printf( "bli_trsm_blk_var1(): finishing trsm subproblem loop.\n" );
#endif

	// Synchronize: the rank-k update that follows needs the packed matrix
	// B to have been fully updated by the trsm subproblem.
	bli_thread_barrier( thread );

	// Carve out what remains of the column panel of A, i.e. the
	// subpartition ahead of the diagonal block (A21 when moving forwards,
	// A01 when moving backwards), together with the matching part of C.
	obj_t a_rest, c_rest;

	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
	                        0, kc, a, &a_rest );
	bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A,
	                        0, kc, c, &c_rest );

#ifdef PRINT
	printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n",
	        (int)bli_obj_length( &a_rest ), (int)bli_obj_width( &a_rest ),
	        (int)bli_obj_row_off( &a_rest ), (int)bli_obj_col_off( &a_rest ) );
#endif

	// Phase 2 is divided among threads: obtain this thread's range.
	bli_thread_range_mdim
	(
	  direct, thread, &a_rest, b, &c_rest, cntl, cntx,
	  &range_beg, &range_end
	);

#ifdef PRINT
	printf( "bli_trsm_blk_var1(): entering gemm subproblem loop (%d->%d).\n", (int)range_beg, (int)range_end );
#endif

	// Phase 2: walk this thread's share of the remainder along m.
	for ( dim_t off = range_beg; off < range_end; off += blk )
	{
		obj_t a_rest_blk, c_rest_blk;

		// Blocksize for this step.
		blk = bli_determine_blocksize( direct, off, range_end, &a_rest,
		                               bli_cntl_bszid( cntl ), cntx );

		// Current row blocks of the remainder of A and of C.
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
		                        off, blk, &a_rest, &a_rest_blk );
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
		                        off, blk, &c_rest, &c_rest_blk );

#ifdef PRINT
		printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n",
		        (int)bli_obj_length( &a_rest_blk ), (int)bli_obj_width( &a_rest_blk ),
		        (int)bli_obj_row_off( &a_rest_blk ), (int)bli_obj_col_off( &a_rest_blk ) );
#endif

		// Perform the gemm subproblem. The same back-end entry point is
		// used as in phase 1, since the same macrokernel is called.
		bli_l3_int
		(
		  &BLIS_ONE,
		  &a_rest_blk,
		  b,
		  &BLIS_ONE,
		  &c_rest_blk,
		  cntx,
		  rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);
	}

#ifdef PRINT
	printf( "bli_trsm_blk_var1(): finishing gemm subproblem loop.\n" );
#endif
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_blk_var2.c 0000664 0000000 0000000 00000006003 14634250137 0023610 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trsm variant 2: partitions the problem along the n dimension.
// The calling thread obtains its own [start,end) sub-range, walks it in
// blocks, and forwards each (A, B1, C1) subproblem to the sub-node of the
// control tree through bli_l3_int().
void bli_trsm_blk_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t b_blk, c_blk;
	dim_t range_begin, range_end;

	// Forwards or backwards traversal, as decided by bli_l3_direct().
	dir_t direct = bli_l3_direct( a, b, c, cntl );

	// Drop any zero region lying along the dimension being partitioned.
	bli_l3_prune_unref_mparts_n( a, b, c, cntl );

	// Ask for the slice of the n dimension assigned to this thread.
	bli_thread_range_ndim
	(
	  direct, thread, a, b, c, cntl, cntx,
	  &range_begin, &range_end
	);

	// Walk the thread's slice one block at a time.
	dim_t j = range_begin;

	while ( j < range_end )
	{
		// Blocksize for this step, using the blocksize id stored in cntl.
		const dim_t nb = bli_determine_blocksize( direct, j, range_end, b,
		                                          bli_cntl_bszid( cntl ), cntx );

		// Carve the matching blocks out of B and C.
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, j, nb, b, &b_blk );
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, j, nb, c, &c_blk );

		// Hand the subproblem to the next level of the control tree.
		bli_l3_int
		(
		  &BLIS_ONE,
		  a,
		  &b_blk,
		  &BLIS_ONE,
		  &c_blk,
		  cntx,
		  rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		j += nb;
	}
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_blk_var3.c 0000664 0000000 0000000 00000006403 14634250137 0023615 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Blocked trsm variant 3: partitions the problem along the k dimension.
// Each step forwards an (A1, B1, C) subproblem to the sub-node of the
// control tree, synchronizes the threads of the sub-node, and -- after the
// first step only -- resets the scalars attached to A, B and C.
void bli_trsm_blk_var3
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a_blk, b_blk;

	// Forwards or backwards traversal, as decided by bli_l3_direct().
	dir_t direct = bli_l3_direct( a, b, c, cntl );

	// Drop any zero region lying along the dimension being partitioned.
	bli_l3_prune_unref_mparts_k( a, b, c, cntl );

	// Extent of the partitioned dimension: the width of A after any
	// transposition has been taken into account.
	const dim_t k_total = bli_obj_width_after_trans( a );

	// Walk the k dimension one block at a time.
	dim_t p = 0;

	while ( p < k_total )
	{
		// Blocksize for this step (trsm-specific KC selection).
		const dim_t kb = bli_trsm_determine_kc( direct, p, k_total, a, b,
		                                        bli_cntl_bszid( cntl ), cntx );

		// A is partitioned along its n dimension, B along its m dimension.
		bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, p, kb, a, &a_blk );
		bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, p, kb, b, &b_blk );

		// Hand the subproblem to the next level of the control tree.
		bli_l3_int
		(
		  &BLIS_ONE,
		  &a_blk,
		  &b_blk,
		  &BLIS_ONE,
		  c,
		  cntx,
		  rntm,
		  bli_cntl_sub_node( cntl ),
		  bli_thrinfo_sub_node( thread )
		);

		//bli_thread_ibarrier( thread );
		bli_thread_barrier( bli_thrinfo_sub_node( thread ) );

		// Several rank-k updates are executed by this variant, so the
		// internal alpha scalars on A/B and C may only take effect during
		// the first one. Reset them once that first step has completed.
		if ( p == 0 )
		{
			bli_obj_scalar_reset( a );
			bli_obj_scalar_reset( b );
			bli_obj_scalar_reset( c );
		}

		p += kb;
	}
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_cntl.c 0000664 0000000 0000000 00000020562 14634250137 0023054 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Build a trsm control tree for the given side by delegating to the
// left-side or right-side constructor. All other arguments are passed
// through unchanged.
cntl_t* bli_trsm_cntl_create
     (
       rntm_t* rntm,
       side_t  side,
       pack_t  schema_a,
       pack_t  schema_b,
       void_fp ker
     )
{
	// Anything that is not a left-side problem gets the right-side tree.
	if ( !bli_is_left( side ) )
		return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker );

	return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker );
}
// Build the control tree used for left-side trsm.
//
// The returned root partitions n by NC (blk_var2); below it come the KC
// loop (blk_var3), the packing of B, and the MC loop (blk_var1). The MC
// node carries two sub-trees: a "gemm" branch attached as its regular
// sub-node and a "trsm" branch attached as its prenode. Both branches pack
// A and then run the macrokernel; only the trsm branch may invert the
// diagonal while packing.
//
// ker: optional macrokernel override; when NULL, bli_trsm_xx_ker_var2 is
//      used.
cntl_t* bli_trsm_l_cntl_create
     (
       rntm_t* rntm,
       pack_t  schema_a,
       pack_t  schema_b,
       void_fp ker
     )
{
	const opid_t family = BLIS_TRSM;

	// Start from the default macrokernel and let a caller-supplied
	// function pointer take precedence.
	void_fp macrokernel = bli_trsm_xx_ker_var2;
	if ( ker != NULL ) macrokernel = ker;

	// Whether the trsm branch inverts the diagonal of A during packing is
	// a configure-time choice.
#ifdef BLIS_ENABLE_TRSM_PREINVERSION
	const bool trsm_invert_diag = TRUE;
#else
	const bool trsm_invert_diag = FALSE;
#endif

	//
	// gemm branch: leaf -> macrokernel -> pack A.
	//

	cntl_t* gemm_leaf = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_MR, // needed for bli_thrinfo_rgrow()
	  NULL,    // no variant function at the leaf
	  NULL     // leaf of the tree: no sub-node
	);

	cntl_t* gemm_macro = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_NR, // unused by the macrokernel; needed for bli_thrinfo_rgrow()
	  macrokernel,
	  gemm_leaf
	);

	cntl_t* gemm_packa = bli_packm_cntl_create_node
	(
	  rntm,
	  bli_l3_packa, // packm function for A
	  BLIS_MR,
	  BLIS_MR,
	  FALSE,    // never invert the diagonal in the gemm branch
	  TRUE,     // reverse iteration if upper
	  FALSE,    // reverse iteration if lower
	  schema_a, // normally BLIS_PACKED_ROW_PANELS
	  BLIS_BUFFER_FOR_A_BLOCK,
	  gemm_macro
	);

	//
	// trsm branch: leaf -> macrokernel -> pack A.
	//

	cntl_t* trsm_leaf = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_MR, // needed for bli_thrinfo_rgrow()
	  NULL,    // no variant function at the leaf
	  NULL     // leaf of the tree: no sub-node
	);

	cntl_t* trsm_macro = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_NR, // unused by the macrokernel; needed for bli_thrinfo_rgrow()
	  macrokernel,
	  trsm_leaf
	);

	cntl_t* trsm_packa = bli_packm_cntl_create_node
	(
	  rntm,
	  bli_l3_packa, // packm function for A
	  BLIS_MR,
	  BLIS_MR,
	  trsm_invert_diag,
	  TRUE,     // reverse iteration if upper
	  FALSE,    // reverse iteration if lower
	  schema_a, // normally BLIS_PACKED_ROW_PANELS
	  BLIS_BUFFER_FOR_A_BLOCK,
	  trsm_macro
	);

	// -------------------------------------------------------------------------

	// MC loop over the m dimension. The gemm branch is its main sub-node...
	cntl_t* mc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_MC,
	  bli_trsm_blk_var1,
	  gemm_packa
	);

	// ...and the trsm branch is hooked in as the auxiliary "prenode".
	bli_cntl_set_sub_prenode( trsm_packa, mc_loop );

	// -------------------------------------------------------------------------

	// Packing of B.
	cntl_t* packb = bli_packm_cntl_create_node
	(
	  rntm,
	  bli_l3_packb,
	  BLIS_NR,
	  BLIS_MR,
	  FALSE,    // do not invert the diagonal
	  FALSE,    // reverse iteration if upper
	  FALSE,    // reverse iteration if lower
	  schema_b, // normally BLIS_PACKED_COL_PANELS
	  BLIS_BUFFER_FOR_B_PANEL,
	  mc_loop
	);

	// KC loop over the k dimension.
	cntl_t* kc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_KC,
	  bli_trsm_blk_var3,
	  packb
	);

	// NC loop over the n dimension; this is the root handed to the caller.
	cntl_t* nc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_NC,
	  bli_trsm_blk_var2,
	  kc_loop
	);

	return nc_loop;
}
// Build the control tree for right-side trsm.
//
// The tree is a single chain: NC loop (blk_var2) -> KC loop (blk_var3) ->
// pack B -> MC loop (blk_var1) -> pack A -> macrokernel -> leaf.
//
// NOTE: trsm macrokernels are presently disabled for right-side execution.
//
// ker: optional macrokernel override; when NULL, bli_trsm_xx_ker_var2 is
//      used.
cntl_t* bli_trsm_r_cntl_create
     (
       rntm_t* rntm,
       pack_t  schema_a,
       pack_t  schema_b,
       void_fp ker
     )
{
	const opid_t family = BLIS_TRSM;

	// Start from the default macrokernel and let a caller-supplied
	// function pointer take precedence.
	void_fp macrokernel = bli_trsm_xx_ker_var2;
	if ( ker != NULL ) macrokernel = ker;

	// Two nodes make up the macrokernel level: the leaf and its parent.
	cntl_t* leaf = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_MR, // needed for bli_thrinfo_rgrow()
	  NULL,    // no variant function at the leaf
	  NULL     // leaf of the tree: no sub-node
	);

	cntl_t* macro = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_NR, // unused by the macrokernel; needed for bli_thrinfo_rgrow()
	  macrokernel,
	  leaf
	);

	// Packing of A.
	cntl_t* packa = bli_packm_cntl_create_node
	(
	  rntm,
	  bli_l3_packa,
	  BLIS_NR,
	  BLIS_MR,
	  FALSE,    // do not invert the diagonal
	  FALSE,    // reverse iteration if upper
	  FALSE,    // reverse iteration if lower
	  schema_a, // normally BLIS_PACKED_ROW_PANELS
	  BLIS_BUFFER_FOR_A_BLOCK,
	  macro
	);

	// MC loop over the m dimension.
	cntl_t* mc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_MC,
	  bli_trsm_blk_var1,
	  packa
	);

	// Packing of B. Unlike the packing of A above, this node requests
	// diagonal inversion and reversed iteration for the lower case.
	cntl_t* packb = bli_packm_cntl_create_node
	(
	  rntm,
	  bli_l3_packb,
	  BLIS_MR,
	  BLIS_MR,
	  TRUE,     // invert the diagonal
	  FALSE,    // reverse iteration if upper
	  TRUE,     // reverse iteration if lower
	  schema_b, // normally BLIS_PACKED_COL_PANELS
	  BLIS_BUFFER_FOR_B_PANEL,
	  mc_loop
	);

	// KC loop over the k dimension.
	cntl_t* kc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_KC,
	  bli_trsm_blk_var3,
	  packb
	);

	// NC loop over the n dimension; this is the root handed to the caller.
	cntl_t* nc_loop = bli_trsm_cntl_create_node
	(
	  rntm,
	  family,
	  BLIS_NC,
	  bli_trsm_blk_var2,
	  kc_loop
	);

	return nc_loop;
}
// Release a trsm control tree. This is a thin trsm-named wrapper: all
// three arguments are forwarded as-is to the generic bli_cntl_free().
void bli_trsm_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	bli_cntl_free
	(
	  rntm,
	  cntl,
	  thread
	);
}
// -----------------------------------------------------------------------------
// Create one node of a trsm control tree by forwarding to the generic
// bli_cntl_create_node(). The argument between var_func and sub_node is
// always passed as NULL here.
cntl_t* bli_trsm_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       cntl_t* sub_node
     )
{
	cntl_t* node = bli_cntl_create_node
	(
	  rntm,
	  family,
	  bszid,
	  var_func,
	  NULL,
	  sub_node
	);

	return node;
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_cntl.h 0000664 0000000 0000000 00000004732 14634250137 0023062 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Builds a trsm control tree for the given side; dispatches to the
// left-side or right-side constructor declared below.
cntl_t* bli_trsm_cntl_create
(
rntm_t* rntm,
side_t side,
pack_t schema_a,
pack_t schema_b,
void_fp ker
);
// Builds the control tree for left-side trsm. A non-NULL ker replaces the
// default macrokernel.
cntl_t* bli_trsm_l_cntl_create
(
rntm_t* rntm,
pack_t schema_a,
pack_t schema_b,
void_fp ker
);
// Builds the control tree for right-side trsm. A non-NULL ker replaces the
// default macrokernel.
cntl_t* bli_trsm_r_cntl_create
(
rntm_t* rntm,
pack_t schema_a,
pack_t schema_b,
void_fp ker
);
// Frees a trsm control tree (forwards to bli_cntl_free()).
void bli_trsm_cntl_free
(
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// -----------------------------------------------------------------------------
// Creates a single control-tree node (forwards to bli_cntl_create_node()).
cntl_t* bli_trsm_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,
cntl_t* sub_node
);
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_front.c 0000664 0000000 0000000 00000012124 14634250137 0023237 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Front-end for trsm. Works on local aliases of A and B (B is aliased twice,
// as b_local and c_local), normalizes them so that only the left-side,
// no-transpose case has to be handled, configures the pack schemas and the
// ways of parallelism, and then launches bli_l3_int through the level-3
// thread decorator.
void bli_trsm_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
)
{
bli_init_once();
obj_t a_local;
obj_t b_local;
obj_t c_local;
// Disabled small-matrix fast path; kept for reference only.
#if 0
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl );
if ( status == BLIS_SUCCESS ) return;
#endif
#endif
// If alpha is zero, scale B by alpha and return. (trsm has no separate
// beta operand; the caller's B itself is scaled here, not an alias.)
if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
{
bli_scalm( alpha, b );
return;
}
// Alias A and B so we can tweak the objects if necessary. Note that
// c_local is a second alias of B; it is what gets passed in the C slot
// of the back-end call below.
bli_obj_alias_to( a, &a_local );
bli_obj_alias_to( b, &b_local );
bli_obj_alias_to( b, &c_local );
// Set the obj_t buffer field to the location currently implied by the row
// and column offsets and then zero the offsets. If any of the original
// obj_t's were views into larger matrices, this step effectively makes
// those obj_t's "forget" their lineage.
bli_obj_reset_origin( &a_local );
bli_obj_reset_origin( &b_local );
bli_obj_reset_origin( &c_local );
// We do not explicitly implement the cases where A is transposed.
// However, we can still handle them. Specifically, if A is marked as
// needing a transposition, we simply induce a transposition. This
// allows us to only explicitly implement the no-transpose cases. Once
// the transposition is induced, the correct algorithm will be called,
// since, for example, an algorithm over a transposed lower triangular
// matrix A moves in the same direction (forwards) as a non-transposed
// upper triangular matrix. And with the transposition induced, the
// matrix now appears to be upper triangular, so the upper triangular
// algorithm will grab the correct partitions, as if it were upper
// triangular (with no transpose) all along.
if ( bli_obj_has_trans( &a_local ) )
{
bli_obj_induce_trans( &a_local );
bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local );
}
// The "#if 1" branch is the active implementation of right-side trsm;
// the "#else" branch is an alternative that is currently compiled out.
#if 1
// If A is being solved against from the right, transpose all operands
// so that we can perform the computation as if A were being solved
// from the left. The local copy of side is toggled accordingly.
if ( bli_is_right( side ) )
{
bli_toggle_side( &side );
bli_obj_induce_trans( &a_local );
bli_obj_induce_trans( &b_local );
bli_obj_induce_trans( &c_local );
}
#else
// NOTE: Enabling this code requires that BLIS NOT be configured with
// BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined.
#ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#error "BLIS_RELAX_MCNR_NCMR_CONSTRAINTS must not be defined for current trsm_r implementation."
#endif
// If A is being solved against from the right, swap A and B so that
// the triangular matrix will actually be on the right.
if ( bli_is_right( side ) )
{
bli_obj_swap( &a_local, &b_local );
}
#endif
// Set the pack schemas within the objects.
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );
// Parse and interpret the contents of the rntm_t object to properly
// set the ways of parallelism for each loop, and then make any
// additional modifications necessary for the current operation.
// The dimensions passed are the length and width of c_local and the
// width of a_local, all taken after the adjustments made above.
bli_rntm_set_ways_for_op
(
BLIS_TRSM,
side,
bli_obj_length( &c_local ),
bli_obj_width( &c_local ),
bli_obj_width( &a_local ),
rntm
);
// Invoke the internal back-end. alpha is passed twice: once in the
// scalar slot preceding A/B and once in the slot preceding C.
bli_l3_thread_decorator
(
bli_l3_int,
BLIS_TRSM, // operation family id
alpha,
&a_local,
&b_local,
alpha,
&c_local,
cntx,
rntm,
cntl
);
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_front.h 0000664 0000000 0000000 00000004071 14634250137 0023246 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Front-end entry point for trsm; defined in bli_trsm_front.c.
void bli_trsm_front
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
cntx_t* cntx,
rntm_t* rntm,
cntl_t* cntl
);
// Optional small-matrix trsm routine.
// NOTE(review): the (currently "#if 0"-disabled) call site in
// bli_trsm_front.c is guarded by BLIS_ENABLE_SMALL_MATRIX_TRSM and stores
// the result in a gint_t, whereas this prototype is guarded by
// BLIS_ENABLE_SMALL_MATRIX and returns err_t -- confirm which is intended
// before re-enabling that call.
#ifdef BLIS_ENABLE_SMALL_MATRIX
err_t bli_trsm_small
(
side_t side,
obj_t* alpha,
obj_t* a,
obj_t* b,
cntx_t* cntx,
cntl_t* cntl
);
#endif
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_ll_ker_var2.c 0000664 0000000 0000000 00000037064 14634250137 0024323 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the datatype-specific implementations that
// bli_trsm_ll_ker_var2() dispatches to. FUNCPTR_T is a macro alias for the
// typedef name gemm_fp. The arguments are the unpacked fields of the
// obj_t operands: diagonal offset and pack schemas, the m/n/k dimensions,
// and for each of A, B and C a buffer pointer plus its strides (and, for
// A and B, panel dimension and panel stride), along with the two scalars.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of function pointers, indexed by execution datatype in
// bli_trsm_ll_ker_var2(). Presumably GENARRAY fills it with one
// trsm_ll_ker_var2 instance per datatype -- confirm against the macro.
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
// Object-based wrapper for the trsm_ll macrokernel. It unpacks the fields
// of A, B and C that the typed implementation needs, picks the
// implementation matching the execution datatype of C from the ftypes
// table, and calls it. The cntl argument is not used here.
void bli_trsm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Execution datatype of C; used as the index into ftypes.
	const num_t dt = bli_obj_exec_dt( c );

	// Structure of A and the packing formats of both inputs.
	const doff_t diagoff  = bli_obj_diag_offset( a );
	const pack_t a_schema = bli_obj_pack_schema( a );
	const pack_t b_schema = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t m_dim = bli_obj_length( c );
	const dim_t n_dim = bli_obj_width( c );
	const dim_t k_dim = bli_obj_width( a );

	// A: buffer, column stride, panel dimension, panel stride.
	void* const a_buf   = bli_obj_buffer_at_off( a );
	const inc_t a_cs    = bli_obj_col_stride( a );
	const dim_t a_pdim  = bli_obj_panel_dim( a );
	const inc_t a_pstep = bli_obj_panel_stride( a );

	// B: buffer, row stride, panel dimension, panel stride.
	void* const b_buf   = bli_obj_buffer_at_off( b );
	const inc_t b_rs    = bli_obj_row_stride( b );
	const dim_t b_pdim  = bli_obj_panel_dim( b );
	const inc_t b_pstep = bli_obj_panel_stride( b );

	// C: buffer and both strides.
	void* const c_buf = bli_obj_buffer_at_off( c );
	const inc_t c_rs  = bli_obj_row_stride( c );
	const inc_t c_cs  = bli_obj_col_stride( c );

	// Scalar attached to B (the non-triangular matrix). It serves as the
	// alpha of the gemmtrsm subproblems, i.e. the scalar that would be
	// applied to the packed copy of B before the trsm subproblem updates
	// it. It may be unit, for example if it was applied during packing.
	void* const alpha_b = bli_obj_internal_scalar_buffer( b );

	// Scalar attached to C. It serves as the "beta" of the gemm-only
	// subproblems, which correspond to micro-panels that do not intersect
	// the diagonal. It is kept separate because the alpha attached to B
	// may have been reset if it was applied during packing.
	void* const alpha_c = bli_obj_internal_scalar_buffer( c );

	// Dispatch to the implementation for this datatype.
	ftypes[ dt ]
	(
	  diagoff,
	  a_schema,
	  b_schema,
	  m_dim,
	  n_dim,
	  k_dim,
	  alpha_b,
	  a_buf, a_cs, a_pdim, a_pstep,
	  b_buf, b_rs, b_pdim, b_pstep,
	  alpha_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
#undef GENTFUNC
/*
   GENTFUNC( ctype, ch, varname )

   Template for the type-specific trsm_ll_ker_var2 macro-kernel. It is
   instantiated by INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) below, and the
   object-level wrapper bli_trsm_ll_ker_var2() reaches the instances through
   its ftypes[] table.

   The generated function walks C in NR-wide column panels (2nd loop, split
   across threads via bli_thread_range_jrir()) and, within each, in MR-tall
   row panels from top to bottom (1st loop, always sequential). For each
   micro-panel of A:
     - if it intersects the diagonal, the fused gemmtrsm micro-kernel is
       called with alpha1;
     - if it lies strictly below the diagonal, the gemm micro-kernel is
       called with alpha = -1 and beta = alpha2;
     - otherwise nothing is done for that micro-panel.

   Parameters of the generated function:
     diagoffa          diagonal offset of A.
     schema_a/schema_b pack schemas; only stored into the auxinfo_t object.
     m, n, k           problem dimensions (m and k may be adjusted locally).
     alpha1            scalar handed to the gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                       packed A; cs_a is aliased as PACKMR, pd_a as MR, and
                       ps_a is the stride to the next micro-panel of A.
     b, rs_b, pd_b, ps_b
                       packed B; rs_b is aliased as PACKNR, pd_b as NR, and
                       ps_b is the stride to the next micro-panel of B.
     alpha2            scalar handed to the gemm micro-kernel as beta.
     c, rs_c, cs_c     output matrix C and its strides.
     cntx              context used to look up both micro-kernels.
     rntm              NOTE(review): not referenced in this body.
     thread            thread info used to partition the 2nd loop.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. (The buffer itself is currently commented out.) */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The check below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) panel counts as one additional iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* NOTE(review): istep_a is computed (and rounded up to even) here but is
not read again anywhere in this macro; only istep_b is consumed. */ \
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Restart at the first micro-panel of A and the top of this column
panel of C. */ \
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A.
Note that k_a10 works out to diagoffa_i. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = is_a_cur; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
a11 = a1 + k_a10 * PACKMR; \
/*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + off_a10 * PACKNR; \
b11 = b1 + off_a11 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the fused gemmtrsm micro-kernel. */ \
gemmtrsm_ukr \
( \
m_cur, \
n_cur, \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
\
/* Advance A by the stride computed for this diagonal-intersecting
micro-panel (ps_a_cur), not by the full panel stride. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
\
a1 += rstep_a; \
} \
\
/* Move down to the next MR rows of C, whether or not a kernel ran. */ \
c11 += rstep_c; \
} \
} \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_lu_ker_var2.c 0000664 0000000 0000000 00000040033 14634250137 0024322 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* FUNCPTR_T (spelled gemm_fp after expansion) is the pointer type for the
   type-specific trsm_lu_ker_var2 functions generated further down in this
   file. Its parameter list must stay in step with the GENTFUNC template and
   with the call made in bli_trsm_lu_ker_var2(). */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of type-specific kernels; bli_trsm_lu_ker_var2() indexes it with the
   execution datatype of C. NOTE(review): GENARRAY is defined elsewhere --
   presumably it emits one entry per supported datatype; confirm. */
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
/*
 * Object-level front end for the trsm_lu_ker_var2 macro-kernel.
 *
 * Unpacks everything the typed kernel needs from the three operand objects
 * (A carries the diagonal offset; B is the non-triangular operand) and then
 * dispatches through ftypes[], using the execution datatype of C as index.
 *
 *   a, b, c  operand objects; C supplies m, n and the datatype, A supplies k.
 *   cntx     forwarded unchanged to the typed kernel.
 *   rntm     forwarded unchanged to the typed kernel.
 *   cntl     not referenced here.
 *   thread   forwarded unchanged to the typed kernel.
 */
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    /* The execution datatype of C decides which typed kernel runs. */
    const num_t  exec_dt  = bli_obj_exec_dt( c );

    /* Structure and packing information. */
    const doff_t a_diagoff = bli_obj_diag_offset( a );
    const pack_t a_schema  = bli_obj_pack_schema( a );
    const pack_t b_schema  = bli_obj_pack_schema( b );

    /* Problem dimensions: m and n come from C, k from the width of A. */
    const dim_t  dim_m = bli_obj_length( c );
    const dim_t  dim_n = bli_obj_width( c );
    const dim_t  dim_k = bli_obj_width( a );

    /* Operand A: buffer, column stride, panel dimension, panel stride. */
    void*        a_buf       = bli_obj_buffer_at_off( a );
    const inc_t  a_col_str   = bli_obj_col_stride( a );
    const dim_t  a_panel_dim = bli_obj_panel_dim( a );
    const inc_t  a_panel_str = bli_obj_panel_stride( a );

    /* Operand B: buffer, row stride, panel dimension, panel stride. */
    void*        b_buf       = bli_obj_buffer_at_off( b );
    const inc_t  b_row_str   = bli_obj_row_stride( b );
    const dim_t  b_panel_dim = bli_obj_panel_dim( b );
    const inc_t  b_panel_str = bli_obj_panel_stride( b );

    /* Operand C: buffer and both strides. */
    void*        c_buf     = bli_obj_buffer_at_off( c );
    const inc_t  c_row_str = bli_obj_row_stride( c );
    const inc_t  c_col_str = bli_obj_col_stride( c );

    /* Scalar attached to B (the non-triangular matrix). It serves as alpha
       in the gemmtrsm subproblems and may be unit, e.g. if it was already
       applied during packing. */
    void*        scal_b = bli_obj_internal_scalar_buffer( b );

    /* Scalar attached to C. It serves as "beta" in the gemm-only subproblems
       (micro-panels that do not intersect the diagonal). It is kept separate
       because the scalar on B may have been reset when applied at packing. */
    void*        scal_c = bli_obj_internal_scalar_buffer( c );

    /* Pick the typed kernel and hand everything over. */
    FUNCPTR_T    typed_ker = ftypes[ exec_dt ];

    typed_ker
    (
      a_diagoff,
      a_schema,
      b_schema,
      dim_m,
      dim_n,
      dim_k,
      scal_b,
      a_buf, a_col_str, a_panel_dim, a_panel_str,
      b_buf, b_row_str, b_panel_dim, b_panel_str,
      scal_c,
      c_buf, c_row_str, c_col_str,
      cntx,
      rntm,
      thread
    );
}
#undef GENTFUNC
/*
   GENTFUNC( ctype, ch, varname )

   Template for the type-specific trsm_lu_ker_var2 macro-kernel. It is
   instantiated by INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) below, and
   bli_trsm_lu_ker_var2() reaches the instances through its ftypes[] table.

   The generated function walks C in NR-wide column panels (2nd loop, split
   across threads via bli_thread_range_jrir()) and, within each, in MR-tall
   row panels from the bottom row panel upward (1st loop, always
   sequential; note i = m_iter - 1 - ib and c11 -= rstep_c). For each
   micro-panel of A:
     - if it intersects the diagonal, the fused gemmtrsm micro-kernel is
       called with alpha1;
     - if it lies strictly above the diagonal, the gemm micro-kernel is
       called with alpha = -1 and beta = alpha2;
     - otherwise nothing is done for that micro-panel.

   Parameters of the generated function:
     diagoffa          diagonal offset of A.
     schema_a/schema_b pack schemas; only stored into the auxinfo_t object.
     m, n, k           problem dimensions (m and k may be adjusted locally).
     alpha1            scalar handed to the gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                       packed A; cs_a is aliased as PACKMR, pd_a as MR, and
                       ps_a is the stride to the next micro-panel of A.
     b, rs_b, pd_b, ps_b
                       packed B; rs_b is aliased as PACKNR, pd_b as NR, and
                       ps_b is the stride to the next micro-panel of B.
     alpha2            scalar handed to the gemm micro-kernel as beta.
     c, rs_c, cs_c     output matrix C and its strides.
     cntx              context used to look up both micro-kernels.
     rntm              NOTE(review): not referenced in this body.
     thread            thread info used to partition the 2nd loop.
*/
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. (The buffer itself is currently commented out.) */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. The check below aborts
when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + i * PACKNR; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) panel counts as one additional iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* NOTE(review): istep_a is computed (and rounded up to even) here but is
not read again anywhere in this macro; only istep_b is consumed. */ \
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Determine the thread range and increment for the 2nd loop.
NOTE: The definition of bli_thread_range_jrir() will depend on whether
slab or round-robin partitioning was requested at configure-time.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Restart at the first stored micro-panel of A, but begin at the
last row panel of C since the m loop below runs bottom-up. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
/* ib counts iterations; i is the corresponding row-panel index,
running from m_iter-1 down to 0. */ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11;; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = is_a_cur; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
a12 = a1 + k_a11 * PACKMR; \
/*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + off_a11 * PACKNR; \
b21 = b1 + off_a12 * PACKNR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the fused gemmtrsm micro-kernel. */ \
gemmtrsm_ukr \
( \
m_cur, \
n_cur, \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
\
/* Advance A by the stride computed for this diagonal-intersecting
micro-panel (ps_a_cur), not by the full panel stride. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
\
a1 += rstep_a; \
} \
\
/* Move up to the previous MR rows of C, whether or not a kernel ran. */ \
c11 -= rstep_c; \
} \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_rl_ker_var2.c 0000664 0000000 0000000 00000040400 14634250137 0024315 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* FUNCPTR_T (spelled gemm_fp after expansion) is the pointer type for the
   type-specific trsm_rl_ker_var2 functions generated further down in this
   file. Unlike the left-side variants, the diagonal offset passed first
   belongs to B (diagoffb). The parameter list must stay in step with the
   GENTFUNC template and with the call made in bli_trsm_rl_ker_var2(). */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of type-specific kernels; bli_trsm_rl_ker_var2() indexes it with the
   execution datatype of C. NOTE(review): GENARRAY is defined elsewhere --
   presumably it emits one entry per supported datatype; confirm. */
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
/*
 * Object-level front end for the trsm_rl_ker_var2 macro-kernel.
 *
 * Unpacks everything the typed kernel needs from the three operand objects
 * (here B carries the diagonal offset and A is the non-triangular operand)
 * and then dispatches through ftypes[], using the execution datatype of C
 * as index.
 *
 *   a, b, c  operand objects; C supplies m, n and the datatype, A supplies k.
 *   cntx     forwarded unchanged to the typed kernel.
 *   rntm     forwarded unchanged to the typed kernel.
 *   cntl     not referenced here.
 *   thread   forwarded unchanged to the typed kernel.
 */
void bli_trsm_rl_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    /* The execution datatype of C decides which typed kernel runs. */
    const num_t  exec_dt  = bli_obj_exec_dt( c );

    /* Structure and packing information; the diagonal offset is B's. */
    const doff_t b_diagoff = bli_obj_diag_offset( b );
    const pack_t a_schema  = bli_obj_pack_schema( a );
    const pack_t b_schema  = bli_obj_pack_schema( b );

    /* Problem dimensions: m and n come from C, k from the width of A. */
    const dim_t  dim_m = bli_obj_length( c );
    const dim_t  dim_n = bli_obj_width( c );
    const dim_t  dim_k = bli_obj_width( a );

    /* Operand A: buffer, column stride, panel dimension, panel stride. */
    void*        a_buf       = bli_obj_buffer_at_off( a );
    const inc_t  a_col_str   = bli_obj_col_stride( a );
    const dim_t  a_panel_dim = bli_obj_panel_dim( a );
    const inc_t  a_panel_str = bli_obj_panel_stride( a );

    /* Operand B: buffer, row stride, panel dimension, panel stride. */
    void*        b_buf       = bli_obj_buffer_at_off( b );
    const inc_t  b_row_str   = bli_obj_row_stride( b );
    const dim_t  b_panel_dim = bli_obj_panel_dim( b );
    const inc_t  b_panel_str = bli_obj_panel_stride( b );

    /* Operand C: buffer and both strides. */
    void*        c_buf     = bli_obj_buffer_at_off( c );
    const inc_t  c_row_str = bli_obj_row_stride( c );
    const inc_t  c_col_str = bli_obj_col_stride( c );

    /* Scalar attached to A (the non-triangular matrix). It serves as alpha
       in the gemmtrsm subproblems and may be unit, e.g. if it was already
       applied during packing. */
    void*        scal_a = bli_obj_internal_scalar_buffer( a );

    /* Scalar attached to C. It serves as "beta" in the gemm-only subproblems
       (micro-panels that do not intersect the diagonal). It is kept separate
       because the scalar on the non-triangular operand may have been reset
       when it was applied at packing. */
    void*        scal_c = bli_obj_internal_scalar_buffer( c );

    /* Pick the typed kernel and hand everything over. */
    FUNCPTR_T    typed_ker = ftypes[ exec_dt ];

    typed_ker
    (
      b_diagoff,
      a_schema,
      b_schema,
      dim_m,
      dim_n,
      dim_k,
      scal_a,
      a_buf, a_col_str, a_panel_dim, a_panel_str,
      b_buf, b_row_str, b_panel_dim, b_panel_str,
      scal_c,
      c_buf, c_row_str, c_col_str,
      cntx,
      rntm,
      thread
    );
}
/*
  GENTFUNC template for the typed trsm "rl" (right-side, lower-triangular)
  macro-kernels. INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) at the end
  instantiates it (presumably once per basic datatype -- confirm against
  that macro's definition).

  The generated function walks the packed micro-panels of B and, for each
  one, loops over the micro-panels of A, executing only those iterations
  for which bli_trsm_my_iter_rr( i, thread ) is true. Panels of B that
  intersect the diagonal are handled by the fused gemmtrsm micro-kernel;
  panels strictly below the diagonal are handled by the plain gemm
  micro-kernel. C is visited from its last NR-wide column panel to its
  first.

  Parameters:
    diagoffb             diagonal offset of the triangular matrix B.
    schema_a, schema_b   pack schemas of A and B; stored (swapped) in the
                         auxinfo_t handed to the micro-kernels.
    m, n, k              problem dimensions; n and k may be adjusted
                         locally (see the comments in the body).
    alpha1               scalar handed to the gemmtrsm micro-kernel.
    alpha2               scalar handed to the gemm micro-kernel together
                         with the constant minus_one.
    a, cs_a, pd_a, ps_a  packed A: buffer, column stride (aliased PACKMR),
                         panel dimension (aliased MR), panel stride.
    b, rs_b, pd_b, ps_b  packed B: buffer, row stride (aliased PACKNR),
                         panel dimension (aliased NR), panel stride.
    c, rs_c, cs_c        matrix C and its row/column strides.
    cntx                 context used to look up the micro-kernels; also
                         forwarded to them.
    rntm                 not referenced in this body.
    thread               thread info used to select m-loop iterations.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "rl" case (right-side/lower-
triangular), it becomes upper-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. (The buffer itself is currently commented
out below.) */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + j * PACKMR; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it so that
we can index to the correct place in C (corresponding to the
part of the panel of B that was packed).
NOTE: This is NOT being done to skip over "no-op" iterations,
as with the trsm_lu macro-kernel. This MUST be done for correct
execution because we use n (via n_iter) to compute diagonal and
index offsets for backwards movement through B. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides, each bumped to the next even value if odd.
NOTE(review): only istep_a is read afterwards (it is stored in aux
below); istep_b is computed but not used anywhere else in this
macro. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( jb = 0; jb < n_iter; ++jb ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
\
/* jb counts iterations while j is the column-panel index, which runs
from n_iter-1 down to 0. b1 advances forward through the packed
panels, but C is addressed backwards: c11 starts (n_iter-1) column
panels past c1, and c1 itself is decremented by cstep_c at the end
of every iteration. */ \
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1 + (n_iter-1)*cstep_c; \
\
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b11 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
b11 = b1; \
b21 = b1 + k_b11 * PACKNR; \
/*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = is_b_cur; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
a11 = a1 + off_b11 * PACKMR; \
a12 = a1 + off_b21 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Invoke the fused gemmtrsm micro-kernel. The panels of B are
passed ahead of those of A, and the strides of C are passed as
( cs_c, rs_c ), in keeping with the transposed formulation
described in the notes above. */ \
gemmtrsm_ukr \
( \
m_cur, \
n_cur, \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
\
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
\
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
c1 -= cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_ru_ker_var2.c 0000664 0000000 0000000 00000037616 14634250137 0024345 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the typed trsm "ru" macro-kernels that the GENTFUNC
// template later in this file defines. FUNCPTR_T is a file-local alias
// for the typedef name gemm_fp.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations; bli_trsm_ru_ker_var2() indexes it with
// the execution datatype of C. (GENARRAY is defined elsewhere; presumably
// it expands to one entry per datatype -- confirm.)
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
void bli_trsm_ru_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end of the "ru" trsm macro-kernel: unpack the
	// fields of a, b and c that the typed implementation needs and forward
	// them to the entry of ftypes[] selected by the execution datatype of
	// C. The cntl argument is accepted but not used here.

	// Datatype selector, diagonal offset of B, and the two pack schemas.
	const num_t  dt_exec  = bli_obj_exec_dt( c );
	const doff_t diagoffb = bli_obj_diag_offset( b );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer and packing geometry of A.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer and packing geometry of B.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// alpha1: internal scalar attached to A, the non-triangular operand.
	// It is the alpha used by the gemmtrsm subproblems, ie: the scalar
	// that would be applied to the packed copy of A before the trsm
	// update. It may be unit, for example if it was already applied
	// during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( a );

	// alpha2: internal scalar attached to C. It serves as the "beta" of
	// the gemm-only subproblems (micro-panels that do not intersect the
	// diagonal). It is kept separately because the scalar attached to A
	// may have been reset when it was applied during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Call the typed implementation for dt_exec.
	ftypes[dt_exec]
	(
	  diagoffb,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
/*
  GENTFUNC template for the typed trsm "ru" (right-side, upper-triangular)
  macro-kernels. INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) at the end
  instantiates it (presumably once per basic datatype -- confirm against
  that macro's definition).

  The generated function walks the packed micro-panels of B and, for each
  one, loops over the micro-panels of A, executing only those iterations
  for which bli_trsm_my_iter_rr( i, thread ) is true. Panels of B that
  intersect the diagonal are handled by the fused gemmtrsm micro-kernel;
  panels strictly above the diagonal are handled by the plain gemm
  micro-kernel. C is visited from its first NR-wide column panel to its
  last.

  Parameters:
    diagoffb             diagonal offset of the triangular matrix B.
    schema_a, schema_b   pack schemas of A and B; stored (swapped) in the
                         auxinfo_t handed to the micro-kernels.
    m, n, k              problem dimensions; n and k may be adjusted
                         locally (see the comments in the body).
    alpha1               scalar handed to the gemmtrsm micro-kernel.
    alpha2               scalar handed to the gemm micro-kernel together
                         with the constant minus_one.
    a, cs_a, pd_a, ps_a  packed A: buffer, column stride (aliased PACKMR),
                         panel dimension (aliased MR), panel stride.
    b, rs_b, pd_b, ps_b  packed B: buffer, row stride (aliased PACKNR),
                         panel dimension (aliased NR), panel stride.
    c, rs_c, cs_c        matrix C and its row/column strides.
    cntx                 context used to look up the micro-kernels; also
                         forwarded to them.
    rntm                 not referenced in this body.
    thread               thread info used to select m-loop iterations.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "ru" case (right-side/upper-
triangular), it becomes lower-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. (The buffer itself is currently commented
out below.) */ \
/*
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
*/ \
\
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t k_b01; \
dim_t off_b01; \
dim_t off_b11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides, each bumped to the next even value if odd.
NOTE(review): only istep_a is read afterwards (it is stored in aux
below); istep_b is computed but not used anywhere else in this
macro. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b01 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
k_b01 = k_b0111 - NR; \
off_b11 = k_b01; \
\
/* Compute the addresses of the panel B01 and the triangular
block B11. */ \
b01 = b1; \
b11 = b1 + k_b01 * PACKNR; \
/*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = is_b_cur; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
a10 = a1 + off_b01 * PACKMR; \
a11 = a1 + off_b11 * PACKMR; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Invoke the fused gemmtrsm micro-kernel. The panels of B are
passed ahead of those of A, and the strides of C are passed as
( cs_c, rs_c ), in keeping with the transposed formulation
described in the notes above. */ \
gemmtrsm_ukr \
( \
m_cur, \
n_cur, \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
\
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if ( bli_trsm_my_iter_rr( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
m_cur, \
n_cur, \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
\
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_var.h 0000664 0000000 0000000 00000005753 14634250137 0022716 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one function taking the three operand
// objects (a, b, c) followed by the context, runtime, control-tree node
// and thread-info pointers. The function name is formed by
// PASTEMAC0(opname), which is defined elsewhere (the corresponding
// definitions in this directory carry a bli_ prefix, e.g.
// bli_trsm_ru_ker_var2).
//
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
( \
obj_t* a, \
obj_t* b, \
obj_t* c, \
cntx_t* cntx, \
rntm_t* rntm, \
cntl_t* cntl, \
thrinfo_t* thread \
);
// Blocked variants.
GENPROT( trsm_blk_var1 )
GENPROT( trsm_blk_var2 )
GENPROT( trsm_blk_var3 )
// Macro-kernel dispatcher, followed by the four macro-kernels
// (ll, lu, rl, ru).
GENPROT( trsm_xx_ker_var2 )
GENPROT( trsm_ll_ker_var2 )
GENPROT( trsm_lu_ker_var2 )
GENPROT( trsm_rl_ker_var2 )
GENPROT( trsm_ru_ker_var2 )
//
// Prototype BLAS-like interfaces with void pointer operands.
//
// GENTPROT( ctype, ch, varname ) declares the typed function
// PASTEMAC(ch,varname). ctype is not used in the prototype itself, since
// all operands are passed as void*. The first parameter is named diagoff
// here; the definitions name it after the triangular operand (diagoffa
// or diagoffb).
//
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoff, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, \
dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, \
dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
);
// Declare the typed variants of each of the four macro-kernels.
// (INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it expands
// GENTPROT once per basic datatype -- confirm.)
INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 )
INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/bli_trsm_xx_ker_var2.c 0000664 0000000 0000000 00000005423 14634250137 0024345 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Dispatch table of trsm macro-kernels, indexed as vars[side][uplo] by
// bli_trsm_xx_ker_var2() below:
//   side: 0 if the root of A is triangular, 1 otherwise (B is then
//         taken to be the triangular operand);
//   uplo: 0 if the root of the triangular operand is lower, 1 otherwise.
static l3_var_oft vars[2][2] =
{
{ bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 },
{ bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 }
};
void bli_trsm_xx_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Select one of the four trsm macro-kernels in vars[][] based on the
	// triangular operand, then forward every argument to it unchanged.

	// First index: 0 when the root object of A is triangular; otherwise 1,
	// in which case B is taken to be the triangular operand.
	const dim_t side = ( bli_obj_root_is_triangular( a ) ? 0 : 1 );

	// The operand whose root object carries the uplo information.
	obj_t* tri = ( side == 0 ? a : b );

	// Second index: 0 when that root object is lower, 1 otherwise.
	const dim_t uplo = ( bli_obj_root_is_lower( tri ) ? 0 : 1 );

	// Call the selected macro-kernel.
	vars[side][uplo]( a, b, c, cntx, rntm, cntl, thread );
}
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/ 0000775 0000000 0000000 00000000000 14634250137 0021171 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2.c 0000664 0000000 0000000 00000046442 14634250137 0025444 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the typed trsm "ll" macro-kernels that the GENTFUNC
// template later in this file defines. FUNCPTR_T is a file-local alias
// for the typedef name gemm_fp.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed implementations; bli_trsm_ll_ker_var2() indexes it with
// the execution datatype of C. (GENARRAY is defined elsewhere; presumably
// it expands to one entry per datatype -- confirm.)
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);
void bli_trsm_ll_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end of the "ll" trsm macro-kernel: unpack the
	// fields of a, b and c that the typed implementation needs and forward
	// them to the entry of ftypes[] selected by the execution datatype of
	// C. The cntl argument is accepted but not used here.

	// Datatype selector, diagonal offset of A, and the two pack schemas.
	const num_t  dt_exec  = bli_obj_exec_dt( c );
	const doff_t diagoffa = bli_obj_diag_offset( a );
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  m        = bli_obj_length( c );
	const dim_t  n        = bli_obj_width( c );
	const dim_t  k        = bli_obj_width( a );

	// Buffer and packing geometry of A.
	void* const  buf_a    = bli_obj_buffer_at_off( a );
	const inc_t  cs_a     = bli_obj_col_stride( a );
	const dim_t  pd_a     = bli_obj_panel_dim( a );
	const inc_t  ps_a     = bli_obj_panel_stride( a );

	// Buffer and packing geometry of B.
	void* const  buf_b    = bli_obj_buffer_at_off( b );
	const inc_t  rs_b     = bli_obj_row_stride( b );
	const dim_t  pd_b     = bli_obj_panel_dim( b );
	const inc_t  ps_b     = bli_obj_panel_stride( b );

	// Buffer and strides of C.
	void* const  buf_c    = bli_obj_buffer_at_off( c );
	const inc_t  rs_c     = bli_obj_row_stride( c );
	const inc_t  cs_c     = bli_obj_col_stride( c );

	// alpha1: internal scalar attached to B, the non-triangular operand.
	// It is the alpha used by the gemmtrsm subproblems, ie: the scalar
	// that would be applied to the packed copy of B before the trsm
	// update. It may be unit, for example if it was already applied
	// during packing.
	void* const  buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// alpha2: internal scalar attached to C. It serves as the "beta" of
	// the gemm-only subproblems (micro-panels that do not intersect the
	// diagonal). It is kept separately because the scalar attached to B
	// may have been reset when it was applied during packing.
	void* const  buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Call the typed implementation for dt_exec.
	ftypes[dt_exec]
	(
	  diagoffa,
	  schema_a,
	  schema_b,
	  m,
	  n,
	  k,
	  buf_alpha1,
	  buf_a, cs_a, pd_a, ps_a,
	  buf_b, rs_b, pd_b, ps_b,
	  buf_alpha2,
	  buf_c, rs_c, cs_c,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   GENTFUNC( ctype, ch, varname ) expands to the definition of the
   type-specific macro-kernel PASTEMAC(ch,varname); the
   INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) line after the macro
   instantiates it.

   Parameters of the generated function:
     diagoffa        diagonal offset of A. A negative value is folded into
                     the C pointer and m, and then treated as zero.
     schema_a/b      pack schemas of A and B; stored in the auxinfo_t that is
                     handed to the micro-kernels. schema_a also selects the
                     4m/3m/rih offset and stride scaling.
     m, n, k         problem dimensions. k is rounded up to a multiple of MR
                     inside the function.
     alpha1          scalar passed to the fused gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                     packed A: buffer, column stride (aliased as PACKMR),
                     panel dimension (aliased as MR) and panel stride.
     b, rs_b, pd_b, ps_b
                     packed B: buffer, row stride (aliased as PACKNR),
                     panel dimension (aliased as NR) and panel stride.
     alpha2          scalar used as beta by the gemm-only subproblems.
     c, rs_c, cs_c   output matrix C and its strides.
     cntx            context from which both micro-kernels are queried.
     rntm            not referenced in the body.
     thread          thread info; consulted through bli_trsm_my_iter() to
                     decide whether this thread handles column panel j
                     (presumably a per-thread ownership test -- confirm), and
                     through bli_thread_num_threads() when choosing the
                     "next B" prefetch address.

   Structure: the outer loop walks n in steps of NR and the inner loop walks
   m in steps of MR. A micro-panel of A that intersects the diagonal is
   handled by the fused gemmtrsm micro-kernel; one that lies strictly below
   the diagonal is handled by the gemm micro-kernel with a scalar of
   minus_one; anything else is skipped. Edge tiles (m_cur != MR or
   n_cur != NR) are computed into the stack buffer ct and then copied or
   accumulated into C.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c 0000664 0000000 0000000 00000047713 14634250137 0026012 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type shared by the type-specific trsm_ll_ker_var2rr
// macro-kernels that the GENTFUNC macro below generates. The parameter list
// mirrors, one for one, the values that bli_trsm_ll_ker_var2rr() unpacks from
// its obj_t operands.
// NOTE(review): the alias is spelled gemm_fp although this file implements a
// trsm macro-kernel; presumably a leftover from the gemm version -- confirm
// before renaming.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of type-specific implementations; bli_trsm_ll_ker_var2rr() indexes it
// with the execution datatype of C to pick the function to call.
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
// Object-based front end for the trsm_ll_ker_var2rr macro-kernel.
//
// Unpacks the dimensions, pack schemas, buffers, strides and internal scalars
// of the operands a, b and c, then forwards them -- together with cntx, rntm
// and thread -- to the type-specific implementation selected by the execution
// datatype of c. The cntl argument is accepted but not used here.
void bli_trsm_ll_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Pick the typed macro-kernel that matches the execution datatype of C.
	const FUNCPTR_T typed_kernel = ftypes[ bli_obj_exec_dt( c ) ];

	// Problem dimensions: C is m x n and k is the width of A.
	const dim_t dim_m = bli_obj_length( c );
	const dim_t dim_n = bli_obj_width( c );
	const dim_t dim_k = bli_obj_width( a );

	// Operand A (the triangular matrix): diagonal offset, pack schema,
	// buffer, column stride, panel dimension and panel stride.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Operand B (the non-triangular matrix): pack schema, buffer, row
	// stride, panel dimension and panel stride.
	const pack_t b_schema  = bli_obj_pack_schema( b );
	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Operand C: buffer and both strides.
	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// The scalar attached to B is the alpha handed to the gemmtrsm
	// subproblems; it may be unit if it was already applied during packing.
	void* const  b_scalar  = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as "beta" in the gemm-only subproblems
	// (micro-panels that do not intersect the diagonal). It is kept separate
	// because the scalar on B may have been reset during packing.
	void* const  c_scalar  = bli_obj_internal_scalar_buffer( c );

	// Hand everything to the typed implementation.
	typed_kernel
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  b_scalar,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  c_scalar,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   GENTFUNC( ctype, ch, varname ) expands to the definition of the
   type-specific macro-kernel PASTEMAC(ch,varname); the
   INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr ) line after the macro
   instantiates it.

   Parameters of the generated function:
     diagoffa        diagonal offset of A. A negative value is folded into
                     the C pointer and m, and then treated as zero.
     schema_a/b      pack schemas of A and B; stored in the auxinfo_t that is
                     handed to the micro-kernels. schema_a also selects the
                     4m/3m/rih offset and stride scaling.
     m, n, k         problem dimensions. k is rounded up to a multiple of MR
                     inside the function.
     alpha1          scalar passed to the fused gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                     packed A: buffer, column stride (aliased as PACKMR),
                     panel dimension (aliased as MR) and panel stride.
     b, rs_b, pd_b, ps_b
                     packed B: buffer, row stride (aliased as PACKNR),
                     panel dimension (aliased as NR) and panel stride.
     alpha2          scalar used as beta by the gemm-only subproblems.
     c, rs_c, cs_c   output matrix C and its strides.
     cntx            context from which both micro-kernels are queried.
     rntm            not referenced in the body.
     thread          thread info; supplies jr_nt and jr_tid and is passed to
                     bli_thread_range_jrir_rr() to obtain the jr_start,
                     jr_end and jr_inc bounds of the outer loop.

   Structure: the outer loop walks the NR-wide column panels j in
   [jr_start, jr_end) with step jr_inc, recomputing b1 and c1 from j on each
   iteration; the inner loop walks m in steps of MR. A micro-panel of A that
   intersects the diagonal is handled by the fused gemmtrsm micro-kernel;
   one that lies strictly below the diagonal is handled by the gemm
   micro-kernel with a scalar of minus_one; anything else is skipped. Edge
   tiles (m_cur != MR or n_cur != NR) are computed into the stack buffer ct
   and then copied or accumulated into C.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are set so that they match the storage of the
original C matrix. For example, if C is column-stored, ct will be
column-stored as well. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c 0000664 0000000 0000000 00000047704 14634250137 0026005 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* FUNCPTR_T is a file-local alias for the identifier gemm_fp. The typedef
   below gives it the signature of the datatype-specific macro-kernels that
   this file generates from its GENTFUNC template. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of type-specific kernels; bli_trsm_ll_ker_var2sl() indexes it with
   the execution datatype of C. NOTE(review): GENARRAY is defined outside this
   file; presumably it declares and initializes one entry per datatype built
   from the given operation name -- confirm against its definition. */
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
/*
   Object-based front end for the slab-partitioned trsm_ll macro-kernel.

   Reads the dimensions, buffers, strides, pack schemas and attached scalars
   from the operand objects and forwards them to the datatype-specific kernel
   selected from ftypes[] by the execution datatype of C.

     a      - triangular operand (diagonal offset, pack schema, panel info).
     b      - non-triangular operand (pack schema, panel info, scalar).
     c      - output operand (dimensions m x n, strides, scalar).
     cntx   - context, passed through to the kernel.
     rntm   - runtime object, passed through to the kernel.
     cntl   - control tree node; not used by this function.
     thread - thread info, passed through to the kernel.
*/
void bli_trsm_ll_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* The execution datatype of C picks the kernel out of the table. */
	const num_t     exec_dt = bli_obj_exec_dt( c );
	const FUNCPTR_T kernel  = ftypes[ exec_dt ];

	/* Triangular operand A. */
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	/* Non-triangular operand B. */
	const pack_t b_schema  = bli_obj_pack_schema( b );
	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	/* Output operand C. */
	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	/* Problem dimensions: C is m x n, and k is the width of A. */
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	/* The scalar attached to B is the alpha handed to the gemmtrsm
	   subproblems. It may be unit, for example if it was already applied
	   during packing. */
	void* const  alpha_b   = bli_obj_internal_scalar_buffer( b );

	/* The scalar attached to C acts as "beta" in the gemm-only subproblems
	   (micro-panels that do not intersect the diagonal). It is kept separate
	   because the alpha attached to B may have been reset when it was
	   applied during packing. */
	void* const  alpha_c   = bli_obj_internal_scalar_buffer( c );

	/* Dispatch to the type-specific macro-kernel. */
	kernel
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_b,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  alpha_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   GENTFUNC template for the typed trsm_ll_ker_var2sl macro-kernel (left
   side, lower-triangular A, slab assignment of column panels to threads).
   The INSERT_GENTFUNC_BASIC0 invocation after the template instantiates it
   for the operation name trsm_ll_ker_var2sl; `ctype` is the element type and
   `ch` the datatype character used by PASTEMAC/PASTECH to form names.

   What the generated function does:
     - Walks C in tiles of MR rows by NR columns. The range of column panels
       (j loop) handled by the calling thread comes from
       bli_thread_range_jrir_sl(); the row panels (i loop) are visited
       sequentially from i = 0 upward.
     - For a micro-panel of A that intersects the diagonal, calls the fused
       gemmtrsm micro-kernel (BLIS_GEMMTRSM_L_UKR) with alpha1.
     - For a micro-panel strictly below the diagonal, calls the gemm
       micro-kernel with alpha = -1 and beta = alpha2.
     - Otherwise does nothing for that micro-panel.
     - Edge tiles (m_cur != MR or n_cur != NR) are computed into the stack
       buffer ct and then copied (gemmtrsm) or accumulated with alpha2 (gemm)
       into C.

   Parameters:
     diagoffa          diagonal offset of A.
     schema_a/schema_b pack schemas of A and B; stored in the auxinfo_t and
                       used to select off_scl and ss_a_num/ss_a_den.
     m, n, k           problem dimensions (the wrapper passes the length and
                       width of C and the width of A).
     alpha1            scalar passed to the gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                       buffer of A, PACKMR, MR and the micro-panel stride.
     b, rs_b, pd_b, ps_b
                       buffer of B, PACKNR, NR and the micro-panel stride.
     alpha2            scalar used as beta for the gemm-only updates.
     c, rs_c, cs_c     buffer and strides of C.
     cntx              context queried for the micro-kernels.
     rntm              not referenced in this body.
     thread            thread info used to partition the j loop.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the storage preference of the gemm
micro-kernel (col_pref): ct is column-stored (rs_ct = 1, cs_ct = MR) if
the micro-kernel prefers columns, and row-stored (rs_ct = NR, cs_ct = 1)
otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1011; \
dim_t k_a10; \
dim_t off_a10; \
dim_t off_a11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region above where the diagonal of A intersects the
left edge of the block, adjust the pointer to C and treat this case as
if the diagonal offset were zero. This skips over the region that was
not packed. (Note we assume the diagonal offset is a multiple of MR;
this assumption will hold as long as the cache blocksizes are each a
multiple of MR and NR.) */ \
if ( diagoffa < 0 ) \
{ \
i = -diagoffa; \
m = m - i; \
diagoffa = 0; \
c_cast = c_cast + (i )*rs_c; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial edge panel counts as one additional iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round the imaginary strides up to an even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use slab assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Every column panel starts again from the first micro-panel of A and
the top tile of C. */ \
a1 = a_cast; \
c11 = c1 + (0 )*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a10 = 0; \
k_a1011 = diagoffa_i + MR; \
k_a10 = k_a1011 - MR; \
off_a11 = k_a10; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1011 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the panel A10 and the triangular
block A11. */ \
a10 = a1; \
/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
/* Compute the addresses of the panel B01 and the block
B11. */ \
b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. On the last
row panel, wrap A back to its first micro-panel and keep B at the
current panel, or at the first panel of B if this is also the
last column panel assigned to this thread. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a10, \
alpha1_cast, \
a10, \
a11, \
b01, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Advance A by the stride computed for this diagonal-intersecting
panel rather than by rstep_a. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 += rstep_c; \
} \
} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2.c 0000664 0000000 0000000 00000044531 14634250137 0025452 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/* FUNCPTR_T is a file-local alias for the identifier gemm_fp. The typedef
   below gives it the signature of the datatype-specific macro-kernels that
   this file generates from its GENTFUNC template. */
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
/* Table of type-specific kernels; bli_trsm_lu_ker_var2() indexes it with the
   execution datatype of C. NOTE(review): GENARRAY is defined outside this
   file; presumably it declares and initializes one entry per datatype built
   from the given operation name -- confirm against its definition. */
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);
/*
   Object-based front end for the trsm_lu macro-kernel.

   Reads the dimensions, buffers, strides, pack schemas and attached scalars
   from the operand objects and forwards them to the datatype-specific kernel
   selected from ftypes[] by the execution datatype of C.

     a      - triangular operand (diagonal offset, pack schema, panel info).
     b      - non-triangular operand (pack schema, panel info, scalar).
     c      - output operand (dimensions m x n, strides, scalar).
     cntx   - context, passed through to the kernel.
     rntm   - runtime object, passed through to the kernel.
     cntl   - control tree node; not used by this function.
     thread - thread info, passed through to the kernel.
*/
void bli_trsm_lu_ker_var2
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	/* The execution datatype of C picks the kernel out of the table. */
	const num_t     exec_dt = bli_obj_exec_dt( c );
	const FUNCPTR_T kernel  = ftypes[ exec_dt ];

	/* Triangular operand A. */
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const pack_t a_schema  = bli_obj_pack_schema( a );
	void* const  a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	/* Non-triangular operand B. */
	const pack_t b_schema  = bli_obj_pack_schema( b );
	void* const  b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	/* Output operand C. */
	void* const  c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	/* Problem dimensions: C is m x n, and k is the width of A. */
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	/* The scalar attached to B is the alpha handed to the gemmtrsm
	   subproblems. It may be unit, for example if it was already applied
	   during packing. */
	void* const  alpha_b   = bli_obj_internal_scalar_buffer( b );

	/* The scalar attached to C acts as "beta" in the gemm-only subproblems
	   (micro-panels that do not intersect the diagonal). It is kept separate
	   because the alpha attached to B may have been reset when it was
	   applied during packing. */
	void* const  alpha_c   = bli_obj_internal_scalar_buffer( c );

	/* Dispatch to the type-specific macro-kernel. */
	kernel
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_b,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  alpha_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
/*
   GENTFUNC template for the typed trsm_lu_ker_var2 macro-kernel (left side,
   upper-triangular A). The INSERT_GENTFUNC_BASIC0 invocation after the
   template instantiates it for the operation name trsm_lu_ker_var2; `ctype`
   is the element type and `ch` the datatype character used by
   PASTEMAC/PASTECH to form names.

   What the generated function does:
     - Walks C in tiles of MR rows by NR columns. Every thread steps through
       all column panels (j loop) but only processes those for which
       bli_trsm_my_iter( j, thread ) is true.
     - Row panels are visited from the bottom of C upward: the loop counter
       ib runs forward while i = m_iter - 1 - ib and c11 is decremented.
     - For a micro-panel of A that intersects the diagonal, calls the fused
       gemmtrsm micro-kernel (BLIS_GEMMTRSM_U_UKR) with alpha1.
     - For a micro-panel strictly above the diagonal, calls the gemm
       micro-kernel with alpha = -1 and beta = alpha2.
     - Otherwise does nothing for that micro-panel.
     - Edge tiles (m_cur != MR or n_cur != NR) are computed into the stack
       buffer ct and then copied (gemmtrsm) or accumulated with alpha2 (gemm)
       into C.

   Parameters:
     diagoffa          diagonal offset of A.
     schema_a/schema_b pack schemas of A and B; stored in the auxinfo_t and
                       used to select off_scl and ss_a_num/ss_a_den.
     m, n, k           problem dimensions (the wrapper passes the length and
                       width of C and the width of A).
     alpha1            scalar passed to the gemmtrsm micro-kernel.
     a, cs_a, pd_a, ps_a
                       buffer of A, PACKMR, MR and the micro-panel stride.
     b, rs_b, pd_b, ps_b
                       buffer of B, PACKNR, NR and the micro-panel stride.
     alpha2            scalar used as beta for the gemm-only updates.
     c, rs_c, cs_c     buffer and strides of C.
     cntx              context queried for the micro-kernels.
     rntm              not referenced in this body.
     thread            thread info used to select this thread's column panels.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer are chosen from the storage preference of the gemm
micro-kernel (col_pref): ct is column-stored (rs_ct = 1, cs_ct = MR) if
the micro-kernel prefers columns, and row-stored (rs_ct = NR, cs_ct = 1)
otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial edge panel counts as one additional iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
/* Round the imaginary strides up to an even value. */ \
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
/* Only process the column panels that belong to this thread; b1 and c1
are still advanced for every j at the bottom of the loop. */ \
if( bli_trsm_my_iter( j, thread ) ) { \
\
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
/* Start from the first micro-panel of A and the last (bottom) row tile
of C; c11 moves upward as the loop proceeds. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
/* ib counts iterations forward; i is the corresponding row-panel
index counted from the bottom. */ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11;; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. On the last
row-panel iteration, wrap A back to its first micro-panel and
keep B at the current panel, or at the first panel of B when
j + (number of threads) reaches n_iter. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Advance A by the stride computed for this diagonal-intersecting
panel rather than by rstep_a. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
if ( j + bli_thread_num_threads(thread) >= n_iter ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
c11 -= rstep_c; \
} \
} \
\
b1 += cstep_b; \
c1 += cstep_c; \
} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c 0000664 0000000 0000000 00000046002 14634250137 0026011 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for gemm_fp, the function-pointer type of
// the datatype-specific macrokernels that the GENTFUNC macro in this file
// defines. The wrapper bli_trsm_lu_ker_var2rr() fills the arguments as follows:
//   diagoffa           - diagonal offset of A
//   schema_a, schema_b - pack schemas of A and B
//   m, n               - length and width of C
//   k                  - width of A
//   alpha1             - internal scalar buffer attached to B
//   a, cs_a, pd_a, ps_a - buffer, column stride, panel dimension and panel
//                         stride of A
//   b, rs_b, pd_b, ps_b - buffer, row stride, panel dimension and panel
//                         stride of B
//   alpha2             - internal scalar buffer attached to C
//   c, rs_c, cs_c      - buffer, row stride and column stride of C
//   cntx, rntm, thread - forwarded unchanged from the wrapper's caller
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed macrokernels; the wrapper indexes it with the execution
// datatype of C (ftypes[dt_exec]).
// NOTE(review): GENARRAY presumably expands to an initializer naming one
// trsm_lu_ker_var2rr instance per datatype -- confirm in the BLIS macro
// headers.
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr);
//
// -- Macrokernel functions for round-robin partitioning -----------------------
//
void bli_trsm_lu_ker_var2rr
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end: unpack the fields of A, B and C that the typed
	// macrokernel needs and dispatch on the execution datatype of C.
	// (cntl is accepted for interface compatibility but is not consulted.)

	// Select the typed macrokernel from the per-datatype table.
	const num_t     exec_dt = bli_obj_exec_dt( c );
	const FUNCPTR_T kernel  = ftypes[ exec_dt ];

	// Problem geometry: C is m x n, A contributes the k dimension and the
	// diagonal offset.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const dim_t  c_rows    = bli_obj_length( c );
	const dim_t  c_cols    = bli_obj_width( c );
	const dim_t  a_cols    = bli_obj_width( a );

	// Packed operand A: schema, buffer, column stride, panel dim/stride.
	const pack_t a_schema   = bli_obj_pack_schema( a );
	void* const  a_buf      = bli_obj_buffer_at_off( a );
	const inc_t  a_cs       = bli_obj_col_stride( a );
	const dim_t  a_paneldim = bli_obj_panel_dim( a );
	const inc_t  a_panelstr = bli_obj_panel_stride( a );

	// Packed operand B: schema, buffer, row stride, panel dim/stride.
	const pack_t b_schema   = bli_obj_pack_schema( b );
	void* const  b_buf      = bli_obj_buffer_at_off( b );
	const inc_t  b_rs       = bli_obj_row_stride( b );
	const dim_t  b_paneldim = bli_obj_panel_dim( b );
	const inc_t  b_panelstr = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void* const  c_buf = bli_obj_buffer_at_off( c );
	const inc_t  c_rs  = bli_obj_row_stride( c );
	const inc_t  c_cs  = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular matrix) is the alpha used
	// by the gemmtrsm subproblems, i.e. the scaling that would be applied to
	// the packed copy of B before the trsm update. It may be unit, e.g. when
	// it was already applied during packing.
	void* const scalar_b = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as "beta" in the gemm-only subproblems
	// (micro-panels that do not intersect the diagonal). It is kept separate
	// because the alpha attached to B may have been reset if it was applied
	// during packing.
	void* const scalar_c = bli_obj_internal_scalar_buffer( c );

	// Hand everything to the typed macrokernel.
	kernel
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  c_rows,
	  c_cols,
	  a_cols,
	  scalar_b,
	  a_buf, a_cs, a_paneldim, a_panelstr,
	  b_buf, b_rs, b_paneldim, b_panelstr,
	  scalar_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the trsm_lu_ker_var2rr macrokernel. A is the packed
// triangular operand (its diagonal-intersecting micro-panels are handled by
// the BLIS_GEMMTRSM_U_UKR micro-kernel), B is the packed non-triangular
// operand, and C receives the result.
//
// Arguments:
//   diagoffa            - diagonal offset of A.
//   schema_a, schema_b  - pack schemas; recorded in the auxinfo_t handed to
//                         the micro-kernels, and schema_a also selects the
//                         4m/3m/rih indexing scale factors.
//   m, n, k             - problem dimensions (the object wrapper passes the
//                         length/width of C and the width of A).
//   alpha1              - scalar passed to the gemmtrsm micro-kernel.
//   alpha2              - scalar passed as beta to the gemm micro-kernel.
//   a, cs_a, pd_a, ps_a - A buffer; cs_a is aliased to PACKMR, pd_a to MR,
//                         ps_a is the micro-panel stride.
//   b, rs_b, pd_b, ps_b - B buffer; rs_b is aliased to PACKNR, pd_b to NR,
//                         ps_b is the micro-panel stride.
//   c, rs_c, cs_c       - C buffer and its strides.
//   cntx                - context used to look up the micro-kernels and
//                         their storage preference.
//   rntm                - not referenced in this body.
//   thread              - thrinfo_t used to obtain this thread's range of
//                         the n (jr) loop, assigned round-robin.
//
// The function returns early when any dimension is zero or when A lies
// strictly below the diagonal, and calls bli_abort() for the odd
// PACKMR/NR or PACKNR/MR combinations checked near the top.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once
// per supported datatype -- confirm in the BLIS macro headers.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer follow the storage preference reported for the gemm
micro-kernel (col_pref), not the strides of the original C matrix:
if columns are preferred, ct is column-stored (rs_ct = 1, cs_ct = MR);
otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. Concretely, abort when
PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) panel counts as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides: istep_a uses the (possibly reduced, then padded) k,
while istep_b uses k_full. Both are rounded up to an even value. */ \
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use round-robin assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* A is walked forward from its first packed micro-panel while C is
walked backward from its last row-panel: c11 starts at row-panel
m_iter-1 and is decremented by rstep_c at the end of each iteration. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
/* ib counts iterations upward; i is the corresponding row-panel
index, counting down from m_iter-1 to 0. */ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11;; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. The m loop
is not split across threads, so its last-iteration test is made
with work id 0 of 1; the n loop uses this thread's jr_tid/jr_nt. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing into the
temporary ct instead of directly into C. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Diagonal-intersecting panels advance by their own stride
ps_a_cur rather than by rstep_a. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel with alpha = -1 and
beta = alpha2, updating C in place. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with alpha = -1 and
beta = 0, writing into the temporary ct. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C; alpha2 is supplied here
because the micro-kernel call above used beta = 0.
NOTE(review): presumably xpbys computes
c11 = ct + alpha2 * c11 -- confirm. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
/* Step C up by one row-panel for the next iteration. */ \
c11 -= rstep_c; \
} \
} \
\
/* Disabled debugging output follows. NOTE(review): it refers to
off_a1112, which is not declared in this macro, so it would need
updating before being re-enabled. */ \
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c 0000664 0000000 0000000 00000045773 14634250137 0026022 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// FUNCPTR_T is a file-local alias for gemm_fp, the function-pointer type of
// the datatype-specific macrokernels that the GENTFUNC macro in this file
// defines. The wrapper bli_trsm_lu_ker_var2sl() fills the arguments as follows:
//   diagoffa           - diagonal offset of A
//   schema_a, schema_b - pack schemas of A and B
//   m, n               - length and width of C
//   k                  - width of A
//   alpha1             - internal scalar buffer attached to B
//   a, cs_a, pd_a, ps_a - buffer, column stride, panel dimension and panel
//                         stride of A
//   b, rs_b, pd_b, ps_b - buffer, row stride, panel dimension and panel
//                         stride of B
//   alpha2             - internal scalar buffer attached to C
//   c, rs_c, cs_c      - buffer, row stride and column stride of C
//   cntx, rntm, thread - forwarded unchanged from the wrapper's caller
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffa,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Table of typed macrokernels; the wrapper indexes it with the execution
// datatype of C (ftypes[dt_exec]).
// NOTE(review): GENARRAY presumably expands to an initializer naming one
// trsm_lu_ker_var2sl instance per datatype -- confirm in the BLIS macro
// headers.
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl);
//
// -- Macrokernel functions for slab partitioning ------------------------------
//
void bli_trsm_lu_ker_var2sl
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end: unpack the fields of A, B and C that the typed
	// macrokernel needs and dispatch on the execution datatype of C.
	// (cntl is accepted for interface compatibility but is not consulted.)

	// Select the typed macrokernel from the per-datatype table.
	const num_t     exec_dt = bli_obj_exec_dt( c );
	const FUNCPTR_T kernel  = ftypes[ exec_dt ];

	// Problem geometry: C is m x n, A contributes the k dimension and the
	// diagonal offset.
	const doff_t a_diagoff = bli_obj_diag_offset( a );
	const dim_t  c_rows    = bli_obj_length( c );
	const dim_t  c_cols    = bli_obj_width( c );
	const dim_t  a_cols    = bli_obj_width( a );

	// Packed operand A: schema, buffer, column stride, panel dim/stride.
	const pack_t a_schema   = bli_obj_pack_schema( a );
	void* const  a_buf      = bli_obj_buffer_at_off( a );
	const inc_t  a_cs       = bli_obj_col_stride( a );
	const dim_t  a_paneldim = bli_obj_panel_dim( a );
	const inc_t  a_panelstr = bli_obj_panel_stride( a );

	// Packed operand B: schema, buffer, row stride, panel dim/stride.
	const pack_t b_schema   = bli_obj_pack_schema( b );
	void* const  b_buf      = bli_obj_buffer_at_off( b );
	const inc_t  b_rs       = bli_obj_row_stride( b );
	const dim_t  b_paneldim = bli_obj_panel_dim( b );
	const inc_t  b_panelstr = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void* const  c_buf = bli_obj_buffer_at_off( c );
	const inc_t  c_rs  = bli_obj_row_stride( c );
	const inc_t  c_cs  = bli_obj_col_stride( c );

	// The scalar attached to B (the non-triangular matrix) is the alpha used
	// by the gemmtrsm subproblems, i.e. the scaling that would be applied to
	// the packed copy of B before the trsm update. It may be unit, e.g. when
	// it was already applied during packing.
	void* const scalar_b = bli_obj_internal_scalar_buffer( b );

	// The scalar attached to C acts as "beta" in the gemm-only subproblems
	// (micro-panels that do not intersect the diagonal). It is kept separate
	// because the alpha attached to B may have been reset if it was applied
	// during packing.
	void* const scalar_c = bli_obj_internal_scalar_buffer( c );

	// Hand everything to the typed macrokernel.
	kernel
	(
	  a_diagoff,
	  a_schema,
	  b_schema,
	  c_rows,
	  c_cols,
	  a_cols,
	  scalar_b,
	  a_buf, a_cs, a_paneldim, a_panelstr,
	  b_buf, b_rs, b_paneldim, b_panelstr,
	  scalar_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the trsm_lu_ker_var2sl macrokernel. A is the packed
// triangular operand (its diagonal-intersecting micro-panels are handled by
// the BLIS_GEMMTRSM_U_UKR micro-kernel), B is the packed non-triangular
// operand, and C receives the result.
//
// Arguments:
//   diagoffa            - diagonal offset of A.
//   schema_a, schema_b  - pack schemas; recorded in the auxinfo_t handed to
//                         the micro-kernels, and schema_a also selects the
//                         4m/3m/rih indexing scale factors.
//   m, n, k             - problem dimensions (the object wrapper passes the
//                         length/width of C and the width of A).
//   alpha1              - scalar passed to the gemmtrsm micro-kernel.
//   alpha2              - scalar passed as beta to the gemm micro-kernel.
//   a, cs_a, pd_a, ps_a - A buffer; cs_a is aliased to PACKMR, pd_a to MR,
//                         ps_a is the micro-panel stride.
//   b, rs_b, pd_b, ps_b - B buffer; rs_b is aliased to PACKNR, pd_b to NR,
//                         ps_b is the micro-panel stride.
//   c, rs_c, cs_c       - C buffer and its strides.
//   cntx                - context used to look up the micro-kernels and
//                         their storage preference.
//   rntm                - not referenced in this body.
//   thread              - thrinfo_t used to obtain this thread's range of
//                         the n (jr) loop, assigned in slabs.
//
// The function returns early when any dimension is zero or when A lies
// strictly below the diagonal, and calls bli_abort() for the odd
// PACKMR/NR or PACKNR/MR combinations checked near the top.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once
// per supported datatype -- confirm in the BLIS macro headers.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffa, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Note that the strides of this
temporary buffer follow the storage preference reported for the gemm
micro-kernel (col_pref), not the strides of the original C matrix:
if columns are preferred, ct is column-stored (rs_ct = 1, cs_ct = MR);
otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffa_i; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_a1112; \
dim_t k_a11; \
dim_t k_a12; \
dim_t off_a11; \
dim_t off_a12; \
dim_t i, j, ib; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_a_num; \
inc_t ss_a_den; \
inc_t ps_a_cur; \
inc_t is_a_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKMR
pd_a == MR
ps_a == stride to next micro-panel of A
rs_b == PACKNR
cs_b == 1
pd_b == NR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. Concretely, abort when
PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of MR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
/* Compute indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_a ) || \
bli_is_3mi_packed( schema_a ) || \
bli_is_rih_packed( schema_a ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
else { ss_a_num = 1; ss_a_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of A
intersects the top edge of the block, adjust the pointer to B and
treat this case as if the diagonal offset were zero. Note that we
don't need to adjust the pointer to A since packm would have simply
skipped over the region that was not stored. */ \
if ( diagoffa > 0 ) \
{ \
i = diagoffa; \
k = k - i; \
diagoffa = 0; \
b_cast = b_cast + ( i * PACKNR ) / off_scl; \
} \
\
/* If there is a zero region below where the diagonal of A intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffa + k < m ) \
{ \
m = -diagoffa + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of MR. If k
isn't a multiple of MR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an MR x MR triangular solve.
This adjustment of k is consistent with what happened when A was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of B. */ \
if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
/* NOTE: We don't need to check that m is a multiple of PACKMR since we
know that the underlying buffer was already allocated to have an m
dimension that is a multiple of PACKMR, with the region between the
last row and the next multiple of MR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
/* A partial (edge) panel counts as one extra iteration. */ \
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides: istep_a uses the (possibly reduced, then padded) k,
while istep_b uses k_full. Both are rounded up to an even value. */ \
istep_a = PACKMR * k; \
istep_b = PACKNR * k_full; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object. */ \
bli_auxinfo_set_schema_a( schema_a, &aux ); \
bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
/* Save the imaginary stride of B to the auxinfo_t object. */ \
bli_auxinfo_set_is_b( istep_b, &aux ); \
\
/* We don't bother querying the thrinfo_t node for the 1st loop because
we can't parallelize that loop in trsm due to the inter-iteration
dependencies that exist. */ \
/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
/* Query the number of threads and thread ids for each loop. */ \
dim_t jr_nt = bli_thread_n_way( thread ); \
dim_t jr_tid = bli_thread_work_id( thread ); \
\
dim_t jr_start, jr_end; \
dim_t jr_inc; \
\
/* Use slab assignment of micropanels to threads in the 2nd loop.
NOTE: Parallelism in the 1st loop is unattainable due to the
inter-iteration dependencies present in trsm. */ \
bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = jr_start; j < jr_end; j += jr_inc ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b2; \
\
b1 = b_cast + j * cstep_b; \
c1 = c_cast + j * cstep_c; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* A is walked forward from its first packed micro-panel while C is
walked backward from its last row-panel: c11 starts at row-panel
m_iter-1 and is decremented by rstep_c at the end of each iteration. */ \
a1 = a_cast; \
c11 = c1 + (m_iter-1)*rstep_c; \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( ib = 0; ib < m_iter; ++ib ) \
{ \
/* ib counts iterations upward; i is the corresponding row-panel
index, counting down from m_iter-1 to 0. */ \
i = m_iter - 1 - ib; \
diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
/* If the current panel of A intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of A resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict a2; \
\
/* Compute various offsets into and lengths of parts of A. */ \
off_a11 = diagoffa_i; \
k_a1112 = k - off_a11;; \
k_a11 = MR; \
k_a12 = k_a1112 - MR; \
off_a12 = off_a11 + k_a11; \
\
/* Compute the panel stride for the current diagonal-
intersecting micro-panel. */ \
is_a_cur = k_a1112 * PACKMR; \
is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
/* Compute the addresses of the triangular block A11 and the
panel A12. */ \
a11 = a1; \
/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
/* Compute the addresses of the block B11 and the panel
B21. */ \
b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. The m loop
is not split across threads, so its last-iteration test is made
with work id 0 of 1; the n loop uses this thread's jr_tid/jr_nt.
NOTE(review): the m-loop test calls the _rr helper even in this
slab variant; with a single work id (0 of 1) it is presumably
equivalent to the _sl helper -- confirm. */ \
a2 = a1 + ps_a_cur; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing into the
temporary ct instead of directly into C. */ \
gemmtrsm_ukr \
( \
k_a12, \
alpha1_cast, \
a12, \
a11, \
b21, \
b11, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Copy the result to the bottom edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
\
/* Diagonal-intersecting panels advance by their own stride
ps_a_cur rather than by rstep_a. */ \
a1 += ps_a_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
{ \
ctype* restrict a2; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1 + rstep_a; \
if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \
{ \
a2 = a_cast; \
b2 = b1; \
if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. */ \
bli_auxinfo_set_next_a( a2, &aux ); \
bli_auxinfo_set_next_b( b2, &aux ); \
\
/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
object. */ \
bli_auxinfo_set_is_a( istep_a, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel with alpha = -1 and
beta = alpha2, updating C in place. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
alpha2_cast, \
c11, rs_c, cs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with alpha = -1 and
beta = 0, writing into the temporary ct. */ \
gemm_ukr \
( \
k, \
minus_one, \
a1, \
b1, \
zero, \
ct, rs_ct, cs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C; alpha2 is supplied here
because the micro-kernel call above used beta = 0.
NOTE(review): presumably xpbys computes
c11 = ct + alpha2 * c11 -- confirm. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
\
a1 += rstep_a; \
} \
\
/* Step C up by one row-panel for the next iteration. */ \
c11 -= rstep_c; \
} \
} \
\
/* Disabled debugging output follows. NOTE(review): it refers to
off_a1112, which is not declared in this macro, so it would need
updating before being re-enabled. */ \
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}
INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_rl_ker_var2.c 0000664 0000000 0000000 00000046721 14634250137 0025452 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific macro-kernels that the
// GENTFUNC macro in this file generates. The parameter list mirrors, in
// order, the arguments that bli_trsm_rl_ker_var2() extracts from its
// operand objects and forwards.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of macro-kernel function pointers declared through GENARRAY.
// bli_trsm_rl_ker_var2() indexes it with the execution datatype of C.
static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2);
void bli_trsm_rl_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end for the right-side/lower-triangular trsm
	// macro-kernel: unpack everything the typed kernel needs from the
	// operand objects, then dispatch on the execution datatype of C.
	// Note that cntl is accepted but not consulted here.

	// Datatype used to pick the typed kernel.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand (B) and pack schemas.
	const doff_t doff_b    = bli_obj_diag_offset( b );
	const pack_t pschema_a = bli_obj_pack_schema( a );
	const pack_t pschema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension, panel stride.
	void*        a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension, panel stride.
	void*        b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void*        c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// The scalar attached to A (the non-triangular matrix) acts as the
	// alpha of the gemmtrsm subproblems, i.e. the scalar that would be
	// applied to the packed copy of A before the trsm subproblem updates
	// it. It may be unit, for example if it was applied during packing.
	void*        alpha_a   = bli_obj_internal_scalar_buffer( a );

	// The scalar attached to C acts as the "beta" of the gemm-only
	// subproblems, which handle micro-panels that do not intersect the
	// diagonal. It is kept separate because the alpha attached to B may
	// have been reset if it was applied during packing.
	void*        alpha_c   = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macro-kernel and hand everything over.
	FUNCPTR_T    typed_ker = ftypes[ exec_dt ];

	typed_ker
	(
	  doff_b,
	  pschema_a,
	  pschema_b,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_a,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  alpha_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the right-side/lower-triangular trsm macro-kernel.
// GENTFUNC is redefined here and expanded by the INSERT_GENTFUNC_BASIC0
// invocation that follows the macro.
//
// Outline of the generated function:
//  - Returns early if any of m, n, k is zero or if the panel of B lies
//    strictly above its diagonal.
//  - Walks the micro-panels of B forward in memory (b1 only advances)
//    while the matching column panels of C are visited from last to first
//    (j = n_iter - 1 - jb, and c1 is decremented by cstep_c each pass).
//  - For a micro-panel of B that intersects the diagonal, every row panel
//    of A/C owned by the calling thread is processed with the fused
//    gemmtrsm micro-kernel.
//  - For a micro-panel strictly below the diagonal, the plain gemm
//    micro-kernel is used with minus_one as alpha and alpha2 as beta.
//  - Partial (edge) tiles are computed into the stack buffer ct and then
//    copied or accumulated into C.
// In every micro-kernel call the roles of A and B, and the row/column
// strides of C, are swapped because the operation is executed as a
// transposed left-side trsm (see the notes inside the macro).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the upper-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "rl" case (right-side/lower-
triangular), it becomes upper-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides are chosen from the
storage preference reported for the gemm micro-kernel (col_pref), not
from the strides of the original C matrix: unit row stride if columns
are preferred, unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b1121; \
dim_t k_b11; \
dim_t k_b21; \
dim_t off_b11; \
dim_t off_b21; \
dim_t i, j, jb; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely above its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute the indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region above where the diagonal of B intersects
the left edge of the panel, adjust the pointer to A and treat this
case as if the diagonal offset were zero. Note that we don't need to
adjust the pointer to B since packm would have simply skipped over
the region that was not stored. */ \
if ( diagoffb < 0 ) \
{ \
j = -diagoffb; \
k = k - j; \
diagoffb = 0; \
a_cast = a_cast + ( j * PACKMR ) / off_scl; \
} \
\
/* If there is a zero region to the right of where the diagonal
of B intersects the bottom of the panel, shrink it so that
we can index to the correct place in C (corresponding to the
part of the panel of B that was packed).
NOTE: This is NOT being done to skip over "no-op" iterations,
as with the trsm_lu macro-kernel. This MUST be done for correct
execution because we use n (via n_iter) to compute diagonal and
index offsets for backwards movement through B. */ \
if ( diagoffb + k < n ) \
{ \
n = diagoffb + k; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides; each is rounded up to an even value below. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( jb = 0; jb < n_iter; ++jb ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b11; \
ctype* restrict b21; \
ctype* restrict b2; \
\
/* jb counts forward, while j is the index of the column panel being
processed, counted from the last panel down to the first. Since c1
is decremented by cstep_c at the end of every iteration, c11 below
addresses column panel j of C. */ \
j = n_iter - 1 - jb; \
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1 + (n_iter-1)*cstep_c; \
\
n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides below the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is above the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b11 = bli_max( -diagoffb_j, 0 ); \
k_b1121 = k - off_b11; \
k_b11 = NR; \
k_b21 = k_b1121 - NR; \
off_b21 = off_b11 + k_b11; \
\
/* Compute the addresses of the triangular block B11 and the
panel B21. */ \
b11 = b1; \
/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b1121 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Only execute the iterations that bli_trsm_my_iter() selects for
this thread; the pointer updates after this block run for every i. */ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a11; \
ctype* restrict a12; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A11 block and A12 panel. */ \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel directly on C. The strides
of C are passed swapped (cs_c, rs_c) because the operation is
executed in transposed form. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing the full tile
to the temporary buffer ct instead of C. */ \
gemmtrsm_ukr \
( \
k_b21, \
alpha1_cast, \
b21, \
b11, \
a12, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy only the m_cur x n_cur valid part of ct to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel directly on C, with minus_one as
alpha and alpha2 as beta. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the full
tile is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C, with alpha2 applied to C by
xpbys_mxn. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
/* Step back one column panel of C for the next iteration. */ \
c1 -= cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/3/trsm/other/bli_trsm_ru_ker_var2.c 0000664 0000000 0000000 00000046121 14634250137 0025455 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Function pointer type for the datatype-specific macro-kernels that the
// GENTFUNC macro in this file generates. The parameter list mirrors, in
// order, the arguments that bli_trsm_ru_ker_var2() extracts from its
// operand objects and forwards.
#define FUNCPTR_T gemm_fp
typedef void (*FUNCPTR_T)
(
doff_t diagoffb,
pack_t schema_a,
pack_t schema_b,
dim_t m,
dim_t n,
dim_t k,
void* alpha1,
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
void* alpha2,
void* c, inc_t rs_c, inc_t cs_c,
cntx_t* cntx,
rntm_t* rntm,
thrinfo_t* thread
);
// Array of macro-kernel function pointers declared through GENARRAY.
// bli_trsm_ru_ker_var2() indexes it with the execution datatype of C.
static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2);
void bli_trsm_ru_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Object-based front end for the right-side/upper-triangular trsm
	// macro-kernel: unpack everything the typed kernel needs from the
	// operand objects, then dispatch on the execution datatype of C.
	// Note that cntl is accepted but not consulted here.

	// Datatype used to pick the typed kernel.
	const num_t  exec_dt   = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand (B) and pack schemas.
	const doff_t doff_b    = bli_obj_diag_offset( b );
	const pack_t pschema_a = bli_obj_pack_schema( a );
	const pack_t pschema_b = bli_obj_pack_schema( b );

	// Problem dimensions: C is m x n, and k is the width of A.
	const dim_t  dim_m     = bli_obj_length( c );
	const dim_t  dim_n     = bli_obj_width( c );
	const dim_t  dim_k     = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension, panel stride.
	void*        a_buf     = bli_obj_buffer_at_off( a );
	const inc_t  a_cs      = bli_obj_col_stride( a );
	const dim_t  a_pd      = bli_obj_panel_dim( a );
	const inc_t  a_ps      = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension, panel stride.
	void*        b_buf     = bli_obj_buffer_at_off( b );
	const inc_t  b_rs      = bli_obj_row_stride( b );
	const dim_t  b_pd      = bli_obj_panel_dim( b );
	const inc_t  b_ps      = bli_obj_panel_stride( b );

	// Output matrix C: buffer and both strides.
	void*        c_buf     = bli_obj_buffer_at_off( c );
	const inc_t  c_rs      = bli_obj_row_stride( c );
	const inc_t  c_cs      = bli_obj_col_stride( c );

	// The scalar attached to A (the non-triangular matrix) acts as the
	// alpha of the gemmtrsm subproblems, i.e. the scalar that would be
	// applied to the packed copy of A before the trsm subproblem updates
	// it. It may be unit, for example if it was applied during packing.
	void*        alpha_a   = bli_obj_internal_scalar_buffer( a );

	// The scalar attached to C acts as the "beta" of the gemm-only
	// subproblems, which handle micro-panels that do not intersect the
	// diagonal. It is kept separate because the alpha attached to B may
	// have been reset if it was applied during packing.
	void*        alpha_c   = bli_obj_internal_scalar_buffer( c );

	// Look up the typed macro-kernel and hand everything over.
	FUNCPTR_T    typed_ker = ftypes[ exec_dt ];

	typed_ker
	(
	  doff_b,
	  pschema_a,
	  pschema_b,
	  dim_m,
	  dim_n,
	  dim_k,
	  alpha_a,
	  a_buf, a_cs, a_pd, a_ps,
	  b_buf, b_rs, b_pd, b_ps,
	  alpha_c,
	  c_buf, c_rs, c_cs,
	  cntx,
	  rntm,
	  thread
	);
}
// Type-generic body of the right-side/upper-triangular trsm macro-kernel.
// GENTFUNC is redefined here and expanded by the INSERT_GENTFUNC_BASIC0
// invocation that follows the macro.
//
// Outline of the generated function:
//  - Returns early if any of m, n, k is zero or if the panel of B lies
//    strictly below its diagonal.
//  - Walks the micro-panels of B and the matching column panels of C
//    forward (b1 and c1 only advance).
//  - For a micro-panel of B that intersects the diagonal, every row panel
//    of A/C owned by the calling thread is processed with the fused
//    gemmtrsm micro-kernel.
//  - For a micro-panel strictly above the diagonal, the plain gemm
//    micro-kernel is used with minus_one as alpha and alpha2 as beta.
//  - Partial (edge) tiles are computed into the stack buffer ct and then
//    copied or accumulated into C.
// In every micro-kernel call the roles of A and B, and the row/column
// strides of C, are swapped because the operation is executed as a
// transposed left-side trsm (see the notes inside the macro).
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
( \
doff_t diagoffb, \
pack_t schema_a, \
pack_t schema_b, \
dim_t m, \
dim_t n, \
dim_t k, \
void* alpha1, \
void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
void* alpha2, \
void* c, inc_t rs_c, inc_t cs_c, \
cntx_t* cntx, \
rntm_t* rntm, \
thrinfo_t* thread \
) \
{ \
const num_t dt = PASTEMAC(ch,type); \
\
/* Alias some constants to simpler names. */ \
const dim_t MR = pd_a; \
const dim_t NR = pd_b; \
const dim_t PACKMR = cs_a; \
const dim_t PACKNR = rs_b; \
\
/* Cast the micro-kernel address to its function pointer type. */ \
/* NOTE: We use the lower-triangular gemmtrsm ukernel because, while
the current macro-kernel targets the "ru" case (right-side/upper-
triangular), it becomes lower-triangular after the kernel operation
is transposed so that all kernel instances are of the "left"
variety (since those are the only trsm ukernels that exist). */ \
PASTECH(ch,gemmtrsm_ukr_ft) \
gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
PASTECH(ch,gemm_ukr_ft) \
gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
/* Temporary C buffer for edge cases. Its strides are chosen from the
storage preference reported for the gemm micro-kernel (col_pref), not
from the strides of the original C matrix: unit row stride if columns
are preferred, unit column stride otherwise. */ \
ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
/ sizeof( ctype ) ] \
__attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
const inc_t rs_ct = ( col_pref ? 1 : NR ); \
const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
ctype* restrict zero = PASTEMAC(ch,0); \
ctype* restrict minus_one = PASTEMAC(ch,m1); \
ctype* restrict a_cast = a; \
ctype* restrict b_cast = b; \
ctype* restrict c_cast = c; \
ctype* restrict alpha1_cast = alpha1; \
ctype* restrict alpha2_cast = alpha2; \
ctype* restrict b1; \
ctype* restrict c1; \
\
doff_t diagoffb_j; \
dim_t k_full; \
dim_t m_iter, m_left; \
dim_t n_iter, n_left; \
dim_t m_cur; \
dim_t n_cur; \
dim_t k_b0111; \
dim_t k_b01; \
dim_t off_b01; \
dim_t off_b11; \
dim_t i, j; \
inc_t rstep_a; \
inc_t cstep_b; \
inc_t rstep_c, cstep_c; \
inc_t istep_a; \
inc_t istep_b; \
inc_t off_scl; \
inc_t ss_b_num; \
inc_t ss_b_den; \
inc_t ps_b_cur; \
inc_t is_b_cur; \
auxinfo_t aux; \
\
/*
Assumptions/assertions:
rs_a == 1
cs_a == PACKNR
pd_a == NR
ps_a == stride to next micro-panel of A
rs_b == PACKMR
cs_b == 1
pd_b == MR
ps_b == stride to next micro-panel of B
rs_c == (no assumptions)
cs_c == (no assumptions)
Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
swapping of values in the control tree (ie: those values used when
packing). This swapping is needed since we cast right-hand trsm in
terms of transposed left-hand trsm. So, if we're going to be
transposing the operation, then A needs to be packed with NR and B
needs to be packed with MR (remember: B is the triangular matrix in
the right-hand side parameter case).
*/ \
\
/* Safety trap: Certain indexing within this macro-kernel does not
work as intended if both MR and NR are odd. */ \
if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
/* If any dimension is zero, return immediately. */ \
if ( bli_zero_dim3( m, n, k ) ) return; \
\
/* Safeguard: If the current panel of B is entirely below its diagonal,
it is implicitly zero. So we do nothing. */ \
if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \
\
/* Compute k_full as k inflated up to a multiple of NR. This is
needed because some parameter combinations of trsm reduce k
to advance past zero regions in the triangular matrix, and
when computing the imaginary stride of B (the non-triangular
matrix), which is used by 4m1/3m1 implementations, we need
this unreduced value of k. */ \
k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
/* Compute the indexing scaling factor for 4m or 3m. This is
needed because one of the packing register blocksizes (PACKMR
or PACKNR) is used to index into the micro-panels of the non-
triangular matrix when computing with a diagonal-intersecting
micro-panel of the triangular matrix. In the case of 4m or 3m,
real values are stored in both sub-panels, and so the indexing
needs to occur in units of real values. The value computed
here is divided into the complex pointer offset to cause the
pointer to be advanced by the correct value. */ \
if ( bli_is_4mi_packed( schema_b ) || \
bli_is_3mi_packed( schema_b ) || \
bli_is_rih_packed( schema_b ) ) off_scl = 2; \
else off_scl = 1; \
\
/* Compute the storage stride scaling. Usually this is just 1.
However, in the case of interleaved 3m, we need to scale the
offset by 3/2. Note that real-only, imag-only, and summed-only
packing formats are not applicable here since trsm is a two-
operand operation only (unlike trmm, which is capable of three-
operand). */ \
if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
else { ss_b_num = 1; ss_b_den = 1; } \
\
/* If there is a zero region to the left of where the diagonal of B
intersects the top edge of the panel, adjust the pointer to C and
treat this case as if the diagonal offset were zero. This skips over
the region that was not packed. (Note we assume the diagonal offset
is a multiple of MR; this assumption will hold as long as the cache
blocksizes are each a multiple of MR and NR.) */ \
if ( diagoffb > 0 ) \
{ \
j = diagoffb; \
n = n - j; \
diagoffb = 0; \
c_cast = c_cast + (j )*cs_c; \
} \
\
/* If there is a zero region below where the diagonal of B intersects the
right side of the block, shrink it to prevent "no-op" iterations from
executing. */ \
if ( -diagoffb + n < k ) \
{ \
k = -diagoffb + n; \
} \
\
/* Check the k dimension, which needs to be a multiple of NR. If k
isn't a multiple of NR, we adjust it higher to satisfy the micro-
kernel, which is expecting to perform an NR x NR triangular solve.
This adjustment of k is consistent with what happened when B was
packed: all of its bottom/right edges were zero-padded, and
furthermore, the panel that stores the bottom-right corner of the
matrix has its diagonal extended into the zero-padded region (as
identity). This allows the trsm of that bottom-right panel to
proceed without producing any infs or NaNs that would infect the
"good" values of the corresponding block of A. */ \
if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
/* NOTE: We don't need to check that n is a multiple of PACKNR since we
know that the underlying buffer was already allocated to have an n
dimension that is a multiple of PACKNR, with the region between the
last column and the next multiple of NR zero-padded accordingly. */ \
\
/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
PASTEMAC(ch,set0s_mxn)( MR, NR, \
ct, rs_ct, cs_ct ); \
\
/* Compute number of primary and leftover components of the m and n
dimensions. */ \
n_iter = n / NR; \
n_left = n % NR; \
\
m_iter = m / MR; \
m_left = m % MR; \
\
if ( n_left ) ++n_iter; \
if ( m_left ) ++m_iter; \
\
/* Determine some increments used to step through A, B, and C. */ \
rstep_a = ps_a; \
\
cstep_b = ps_b; \
\
rstep_c = rs_c * MR; \
cstep_c = cs_c * NR; \
\
/* Imaginary strides; each is rounded up to an even value below. */ \
istep_a = PACKMR * k_full; \
istep_b = PACKNR * k; \
\
if ( bli_is_odd( istep_a ) ) istep_a += 1; \
if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
/* Save the pack schemas of A and B to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_schema_a( schema_b, &aux ); \
bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
/* Save the imaginary stride of A to the auxinfo_t object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_b( istep_a, &aux ); \
\
b1 = b_cast; \
c1 = c_cast; \
\
/* Loop over the n dimension (NR columns at a time). */ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype* restrict a1; \
ctype* restrict c11; \
ctype* restrict b01; \
ctype* restrict b11; \
ctype* restrict b2; \
\
diagoffb_j = diagoffb - ( doff_t )j*NR; \
a1 = a_cast; \
c11 = c1; \
\
n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
/* Initialize our next panel of B to be the current panel of B. */ \
b2 = b1; \
\
/* If the current panel of B intersects the diagonal, use a
special micro-kernel that performs a fused gemm and trsm.
If the current panel of B resides above the diagonal, use
a regular gemm micro-kernel. Otherwise, if it is below the
diagonal, it was not packed (because it is implicitly zero)
and so we do nothing. */ \
if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Determine the offset to and length of the panel that was packed
so we can index into the corresponding location in A. */ \
off_b01 = 0; \
k_b0111 = bli_min( k, -diagoffb_j + NR ); \
k_b01 = k_b0111 - NR; \
off_b11 = k_b01; \
\
/* Compute the addresses of the panel B01 and the triangular
block B11. */ \
b01 = b1; \
/* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \
b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \
\
/* Compute the panel stride for the current micro-panel. */ \
is_b_cur = k_b0111 * PACKNR; \
is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
/* Only execute the iterations that bli_trsm_my_iter() selects for
this thread; the pointer updates after this block run for every i. */ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a10; \
ctype* restrict a11; \
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the A10 panel and A11 block. */ \
a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \
a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + ps_b_cur; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the fused gemm/trsm micro-kernel directly on C. The strides
of C are passed swapped (cs_c, rs_c) because the operation is
executed in transposed form. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the fused gemm/trsm micro-kernel, writing the full tile
to the temporary buffer ct instead of C. */ \
gemmtrsm_ukr \
( \
k_b01, \
alpha1_cast, \
b01, \
b11, \
a10, \
a11, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Copy only the m_cur x n_cur valid part of ct to the edge of C. */ \
PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += ps_b_cur; \
} \
else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \
{ \
/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
object.
NOTE: We swap the values for A and B since the triangular
"A" matrix is actually contained within B. */ \
bli_auxinfo_set_is_a( istep_b, &aux ); \
\
/* Loop over the m dimension (MR rows at a time). */ \
for ( i = 0; i < m_iter; ++i ) \
{ \
if( bli_trsm_my_iter( i, thread ) ){ \
\
ctype* restrict a2; \
\
m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
/* Compute the addresses of the next panels of A and B. */ \
a2 = a1; \
/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
if ( i + bli_thread_num_threads(thread) >= m_iter ) \
{ \
a2 = a_cast; \
b2 = b1 + cstep_b; \
if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \
b2 = b_cast; \
} \
\
/* Save addresses of next panels of A and B to the auxinfo_t
object. NOTE: We swap the values for A and B since the
triangular "A" matrix is actually contained within B. */ \
bli_auxinfo_set_next_a( b2, &aux ); \
bli_auxinfo_set_next_b( a2, &aux ); \
\
/* Handle interior and edge cases separately. */ \
if ( m_cur == MR && n_cur == NR ) \
{ \
/* Invoke the gemm micro-kernel directly on C, with minus_one as
alpha and alpha2 as beta. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
alpha2_cast, \
c11, cs_c, rs_c, \
&aux, \
cntx \
); \
} \
else \
{ \
/* Invoke the gemm micro-kernel with a zero beta so that the full
tile is written to the temporary buffer ct. */ \
gemm_ukr \
( \
k, \
minus_one, \
b1, \
a1, \
zero, \
ct, cs_ct, rs_ct, \
&aux, \
cntx \
); \
\
/* Add the result to the edge of C, with alpha2 applied to C by
xpbys_mxn. */ \
PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
ct, rs_ct, cs_ct, \
alpha2_cast, \
c11, rs_c, cs_c ); \
} \
} \
\
a1 += rstep_a; \
c11 += rstep_c; \
} \
\
b1 += cstep_b; \
} \
\
/* Advance to the next column panel of C. */ \
c1 += cstep_c; \
} \
}
INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 )
cython-blis-1.0.0/blis/_src/frame/base/ 0000775 0000000 0000000 00000000000 14634250137 0017633 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/bli_apool.c 0000664 0000000 0000000 00000043642 14634250137 0021750 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
void bli_apool_init
     (
       apool_t* restrict apool
     )
{
    // Status slot handed to bli_malloc_intl(); its value is not read here.
    err_t r_val;

    // NOTE: The mutex inside the apool_t is deliberately left untouched.
    // The apool_t is only used to define the sba, and its mutex is
    // initialized statically so that there is one less thing that could go
    // wrong during library initialization.

    // Starting configuration:
    // - no array_t blocks are allocated up front;
    // - the block pointer array has room for at least eight entries;
    // - array_t blocks created later start out with a single element
    //   (moot for now, since no blocks are created here).
    const siz_t n_blocks_init   = 0;
    const siz_t n_elem_init     = 1;
    const siz_t ptrs_len_wanted = 8;

    // Unlike the bli_pool API, the "blocks" of an apool_t are always
    // array_t structs. There is no need to track a per-block size (the
    // reason pblk_t exists), so block_ptrs is simply an array of array_t*.
    // The block and alignment sizes are still recorded in the underlying
    // pool_t for completeness, even though they won't be queried;
    // bli_apool_alloc_block() hard-codes the block size itself.
    const siz_t array_t_size = sizeof( array_t );
    const siz_t alignment    = 64;

    // The pool_t embedded in the apool_t does the actual bookkeeping.
    pool_t* restrict pool = bli_apool_pool( apool );

    // Record the default array_t length in the apool_t.
    bli_apool_set_def_array_len( n_elem_init, apool );

    // The pointer array must be able to hold every initial block.
    const siz_t ptrs_len = bli_max( ptrs_len_wanted, n_blocks_init );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_init(): allocating block_ptrs (length %d): ",
            ( int )ptrs_len );
#endif

    // Allocate the array of array_t pointers.
    array_t** restrict block_ptrs
    =
    bli_malloc_intl( ptrs_len * sizeof( array_t* ), &r_val );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_init(): allocating %d array_t.\n", ( int )n_blocks_init );
    fflush( stdout );
#endif

    // Create each of the initial blocks, telling the allocator how many
    // elements every new array_t should start with.
    for ( dim_t blk = 0; blk < n_blocks_init; ++blk )
    {
        bli_apool_alloc_block( n_elem_init, &(block_ptrs[blk]) );
    }

    // NOTE: top_index behaves like a stack pointer on a number line. With
    // top_index == 0 no block is checked out; with top_index == num_blocks
    // every allocated block is checked out. Blocks are handed out from the
    // lowest index upward, and newly allocated blocks are appended at the
    // high end. (The alternative convention, in which -1 would denote an
    // empty stack, was intentionally not used.)

    // Populate the pool_t. The malloc_fp and free_fp fields are not used at
    // the apool_t level, so they are set to NULL.
    bli_pool_set_block_ptrs( block_ptrs, pool );
    bli_pool_set_block_ptrs_len( ptrs_len, pool );
    bli_pool_set_top_index( 0, pool );
    bli_pool_set_num_blocks( n_blocks_init, pool );
    bli_pool_set_block_size( array_t_size, pool );
    bli_pool_set_align_size( alignment, pool );
    bli_pool_set_malloc_fp( NULL, pool );
    bli_pool_set_free_fp( NULL, pool );
}
void bli_apool_alloc_block
     (
       siz_t              num_elem,
       array_t** restrict array_p
     )
{
    // Status slot handed to bli_malloc_intl(); its value is not read here.
    err_t r_val;

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_alloc_block(): allocating array_t: " );
#endif

    // An apool_t is by definition a pool of array_t, so the size of a block
    // is fixed: allocate room for exactly one array_t struct.
    array_t* restrict new_array
    =
    bli_malloc_intl( sizeof( array_t ), &r_val );

    // Set up the array_t in the fresh memory so that it holds num_elem
    // elements, each the size of a pool_t pointer.
    bli_array_init( num_elem, sizeof( pool_t* ), new_array );

    // Hand the new array_t back through the caller's pointer.
    *array_p = new_array;
}
void bli_apool_free_block
     (
       array_t* restrict array
     )
{
    // The array_t stores one pool_t* per element.
    const siz_t       n_pools = bli_array_num_elem( array );
    pool_t** restrict pools   = bli_array_buf( array );

    // Visit every element and tear down the pool_t it refers to, if any.
    for ( dim_t elem = 0; elem < n_pools; ++elem )
    {
        pool_t* restrict cur_pool = pools[ elem ];

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n",
                ( int )elem );
        fflush( stdout );
#endif

        // A NULL element means no pool_t was ever created for this slot.
        if ( cur_pool == NULL ) continue;

        // Release whatever the pool_t holds ...
        bli_pool_finalize( cur_pool );

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_apool_free_block(): pool_t %d: ", ( int )elem );
#endif

        // ... and then the pool_t struct itself.
        bli_free_intl( cur_pool );
    }

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_free_block(): " );
#endif

    // Release the buffer owned by the array_t.
    bli_array_finalize( array );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_free_block(): freeing array_t: " );
#endif

    // Finally, release the array_t struct.
    bli_free_intl( array );
}
void bli_apool_finalize
     (
       apool_t* restrict apool
     )
{
    // NOTE: The apool_t's mutex is initialized statically, so there is
    // nothing to destroy here.

    // All bookkeeping lives in the pool_t embedded in the apool_t.
    pool_t* restrict pool = bli_apool_pool( apool );

    // Sanity check: top_index is incremented on checkout and decremented on
    // checkin, so it should be back at zero by now. Abort otherwise.
    if ( bli_pool_top_index( pool ) != 0 ) bli_abort();

    // Fetch the block pointer array and the number of blocks currently
    // allocated.
    array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );
    const siz_t        n_blocks   = bli_pool_num_blocks( pool );

    // Release every block (each one an array_t) held by the pool.
    for ( dim_t blk = 0; blk < n_blocks; ++blk )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_apool_finalize(): freeing array_t %d within apool_t.\n",
                ( int )blk );
        fflush( stdout );
#endif

        bli_apool_free_block( block_ptrs[ blk ] );
    }

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_finalize(): freeing block_ptrs (length %d): ",
            ( int )( bli_pool_block_ptrs_len( pool ) ) );
#endif

    // Release the block pointer array itself.
    bli_free_intl( block_ptrs );
}
// Checks out one array_t from the apool_t and returns it, resized so that
// it holds at least n_threads elements.
//
// - n_threads: number of elements the caller needs in the array_t (one per
//   thread).
// - apool:     the pool of array_t to check out from.
//
// The pool bookkeeping is done while holding the apool_t's mutex; the
// resize is done after the mutex is released.
array_t* bli_apool_checkout_array
     (
       siz_t             n_threads,
       apool_t* restrict apool
     )
{
    // Acquire the apool_t's mutex.
    bli_apool_lock( apool );

    // ----------------------------------------------------------------------------

    // NOTE: Unlike with the bli_pool API, we do not need to handle potential
    // reinitialization since the apool_t's block_size (corresponding to the
    // size of an array_t struct) will never grow.

    // If the apool_t is exhausted, add a block (e.g. an array_t).
    if ( bli_apool_is_exhausted( apool ) )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        // The message names this function; it previously referred to a
        // non-existent bli_apool_checkout_block().
        printf( "bli_apool_checkout_array(): apool_t is exhausted; "
                "growing by 1 array_t.\n" );
        fflush( stdout );
#endif
        bli_apool_grow( 1, apool );
    }

    // At this point, at least one array_t is guaranteed to be available.

    // Query the underlying pool_t from the apool_t.
    pool_t* restrict pool = bli_apool_pool( apool );

    // Query the block_ptrs array.
    array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );

    // Query the top_index of the pool.
    const siz_t top_index = bli_pool_top_index( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_checkout_array(): checking out array_t %d.\n",
            ( int )top_index );
    fflush( stdout );
#endif

    // Select the array_t* at top_index to return to the caller.
    array_t* restrict array = block_ptrs[ top_index ];

    // Increment the pool's top_index.
    bli_pool_set_top_index( top_index + 1, pool );

    // ----------------------------------------------------------------------------

    // Release the apool_t's mutex.
    bli_apool_unlock( apool );

    // Resize the array_t according to the number of threads specified by the
    // caller. (We need one element in the array_t per thread.)
    bli_array_resize( n_threads, array );

    // Return the selected array_t*.
    return array;
}
// Returns a previously checked-out array_t to the apool_t.
//
// - array: the array_t being checked back in.
// - apool: the pool the array_t is returned to.
//
// The array_t pointer is stored at index top_index - 1 and top_index is
// then decremented, all while holding the apool_t's mutex.
// NOTE(review): top_index is not checked against zero here, so this
// presumably relies on every checkin being paired with an earlier
// checkout -- confirm against callers.
void bli_apool_checkin_array
     (
       array_t* restrict array,
       apool_t* restrict apool
     )
{
    // Acquire the apool_t's mutex.
    bli_apool_lock( apool );

    // Query the underlying pool_t from the apool_t.
    pool_t* restrict pool = bli_apool_pool( apool );

    // ----------------------------------------------------------------------------

    // NOTE: Unlike with the bli_pool API, we do not need to handle potential
    // freeing of the blocks upon checkin due to the block_size having since
    // changed due to reinitialization since the apool's block_size will never
    // change.

    // Query the block_ptrs array.
    array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );

    // Query the top_index of the pool.
    const siz_t top_index = bli_pool_top_index( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
    // The message names this function; it previously referred to a
    // non-existent bli_apool_checkin_block().
    printf( "bli_apool_checkin_array(): checking in array_t %d.\n",
            ( int )top_index - 1 );
    fflush( stdout );
#endif

    // Copy the caller's array_t address to the element at top_index - 1.
    block_ptrs[ top_index - 1 ] = array;

    // Decrement the pool's top_index.
    bli_pool_set_top_index( top_index - 1, pool );

    // ----------------------------------------------------------------------------

    // Release the apool_t's mutex.
    bli_apool_unlock( apool );
}
pool_t* bli_apool_array_elem
     (
       siz_t             index,
       array_t* restrict array
     )
{
    // Status slot handed to bli_malloc_intl(); its value is not read here.
    err_t r_val;

    // NOTE: array_t is general-purpose and does not know the size of its
    // elements at compile time, so bli_array_elem() hands back the address
    // of the element rather than its value. The elements stored by an
    // apool_t are pool_t*, which makes that address a pool_t**; hence the
    // dereference.
    pool_t** restrict slot = bli_array_elem( index, array );
    pool_t*           pool = *slot;

    // A non-NULL element already refers to an allocated and initialized
    // pool_t for this index (thread id); nothing more to do.
    if ( pool != NULL ) return pool;

    // Otherwise a pool_t must be created for this index. Parameters used to
    // initialize it:
    const siz_t n_blocks_init = 1;
    const siz_t ptrs_len_init = 25;
    const siz_t alignment     = 16;
    const siz_t offset        = 0;

    malloc_ft pool_malloc = BLIS_MALLOC_POOL;
    free_ft   pool_free   = BLIS_FREE_POOL;

    // The blocks of a small block pool must be big enough for any of the
    // data structures that will be placed in them, so use the largest of
    // the candidate sizes as the block size.
    const siz_t n_candidates = 4;
    siz_t       candidates[4] = { sizeof( cntl_t ),
                                  sizeof( packm_params_t ),
                                  sizeof( thrcomm_t ),
                                  sizeof( thrinfo_t ) };

    siz_t largest = 0;

    for ( dim_t c = 0; c < n_candidates; ++c )
    {
        largest = ( largest < candidates[c] ? candidates[c] : largest );
    }

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_array_elem(): pool_t for tid %d is NULL; allocating pool_t.\n",
            ( int )index );
    printf( "bli_apool_array_elem(): allocating pool_t: " );
#endif

    // Allocate the pool_t struct ...
    pool = bli_malloc_intl( sizeof( pool_t ), &r_val );

    // ... and initialize it.
    bli_pool_init
    (
      n_blocks_init,
      ptrs_len_init,
      largest,
      alignment,
      offset,
      pool_malloc,
      pool_free,
      pool
    );

    // Store the new pool_t's address in the array element. The bli_array
    // API handles arbitrarily-sized elements, so it takes the address of
    // the data to copy (here, the address of the pool_t*) rather than the
    // value.
    bli_array_set_elem( &pool, index, array );

    // Return the newly created pool_t.
    return pool;
}
// Adds num_blocks_add newly allocated array_t blocks to the apool_t,
// enlarging the block_ptrs array first if it cannot hold them.
//
// - num_blocks_add: number of array_t blocks to add; zero is a no-op.
// - apool:          the pool to grow.
//
// This function does not acquire the apool_t's mutex itself;
// bli_apool_checkout_array() calls it while holding the lock.
void bli_apool_grow
     (
       siz_t             num_blocks_add,
       apool_t* restrict apool
     )
{
    // Status slot handed to bli_malloc_intl(); its value is not read here.
    err_t r_val;

    // If the requested increase is zero, return early.
    if ( num_blocks_add == 0 ) return;

    // Query the underlying pool_t from the apool_t.
    pool_t* restrict pool = bli_apool_pool( apool );

    // Query the default initial array length from the apool_t.
    const siz_t num_elem = bli_apool_def_array_len( apool );

    // ----------------------------------------------------------------------------

    // Query the allocated length of the block_ptrs array and also the
    // total number of blocks currently allocated.
    const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool );
    const siz_t num_blocks_cur     = bli_pool_num_blocks( pool );

    // Compute the total number of allocated blocks that will exist
    // after we grow the pool.
    const siz_t num_blocks_new = num_blocks_cur + num_blocks_add;

    // If adding num_blocks_add new blocks will exceed the current capacity
    // of the block_ptrs array, we need to first put in place a new (larger)
    // array.
    if ( block_ptrs_len_cur < num_blocks_new )
    {
        // To prevent this from happening often, we double the current
        // length of the block_ptrs array.
        siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur;

        // Doubling alone is not always enough: for example, with a current
        // length of 8, 8 blocks allocated and 10 blocks requested, 18
        // entries are needed but doubling yields only 16. Make sure the new
        // array can hold every block, otherwise the allocation loop below
        // would write past its end.
        if ( block_ptrs_len_new < num_blocks_new )
        {
            block_ptrs_len_new = num_blocks_new;
        }

        // Query the current block_ptrs array.
        array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ",
                ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new );
#endif

        // Allocate a new block_ptrs array.
        array_t** restrict block_ptrs_new
        =
        bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val );

        // Query the top_index of the pool.
        const siz_t top_index = bli_pool_top_index( pool );

        // Copy the contents of the old block_ptrs array to the new/resized
        // array. Notice that we can begin with top_index since all entries
        // from 0 to top_index-1 have been (and are currently) checked out
        // to threads.
        for ( dim_t i = top_index; i < num_blocks_cur; ++i )
        {
            block_ptrs_new[i] = block_ptrs_cur[i];
        }

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_apool_grow(): freeing prev block_ptrs: " );
#endif

        // Free the old block_ptrs array.
        bli_free_intl( block_ptrs_cur );

        // Update the pool_t struct with the new block_ptrs array and
        // record its allocated length.
        bli_pool_set_block_ptrs( block_ptrs_new, pool );
        bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool );
    }

    // At this point, we are guaranteed to have enough unused elements
    // in the block_ptrs array to accommodate an additional num_blocks_add
    // blocks.

    // Query the current block_ptrs array (which was maybe just resized).
    array_t** restrict block_ptrs = bli_pool_block_ptrs( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n",
            ( int )num_blocks_cur, ( int )num_blocks_new );
    fflush( stdout );
#endif

    // Allocate the requested additional blocks in the resized array.
    for ( dim_t i = num_blocks_cur; i < num_blocks_new; ++i )
    {
        bli_apool_alloc_block
        (
          num_elem,
          &(block_ptrs[i])
        );
    }

    // Update the pool_t struct with the new number of allocated blocks.
    // Notice that top_index remains unchanged, as do the block_size and
    // align_size fields.
    bli_pool_set_num_blocks( num_blocks_new, pool );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_apool.h 0000664 0000000 0000000 00000006773 14634250137 0021761 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_APOOL_H
#define BLIS_APOOL_H
// -- Locked pool-of-arrays type --
/*
typedef struct
{
bli_pthread_mutex_t mutex;
pool_t pool;
siz_t def_array_len;
} apool_t;
*/
// apool entry query
// Returns the address of the pool_t embedded in the given apool_t.
BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
{
    pool_t* embedded_pool = &apool->pool;

    return embedded_pool;
}
// Returns the address of the mutex embedded in the given apool_t.
BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
{
    bli_pthread_mutex_t* embedded_mutex = &apool->mutex;

    return embedded_mutex;
}
// Returns the default array length recorded in the given apool_t. (Despite
// its name, the 'pool' parameter is an apool_t*, not a pool_t*.)
BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool )
{
    const siz_t def_len = pool->def_array_len;

    return def_len;
}
// Reports whether the pool_t underlying the given apool_t is exhausted, as
// determined by bli_pool_is_exhausted().
BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool )
{
    return bli_pool_is_exhausted( bli_apool_pool( apool ) );
}
// apool action
// Locks the mutex embedded in the given apool_t. The return value of
// bli_pthread_mutex_lock() is not inspected.
BLIS_INLINE void bli_apool_lock( apool_t* apool )
{
    bli_pthread_mutex_t* mutex = bli_apool_mutex( apool );

    bli_pthread_mutex_lock( mutex );
}
// Unlocks the mutex embedded in the given apool_t. The return value of
// bli_pthread_mutex_unlock() is not inspected.
BLIS_INLINE void bli_apool_unlock( apool_t* apool )
{
    bli_pthread_mutex_t* mutex = bli_apool_mutex( apool );

    bli_pthread_mutex_unlock( mutex );
}
// apool entry modification
// Records def_array_len as the default array length of the given apool_t.
// (Despite its name, the 'pool' parameter is an apool_t*, not a pool_t*.)
//
// NOTE: The signature line used to end in a stray line-continuation
// backslash. It was harmless to the compiler but served no purpose outside
// of a macro definition, so it has been removed.
BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool )
{
    pool->def_array_len = def_array_len;
}
// -----------------------------------------------------------------------------
// Initializes the apool_t as an empty pool (no array_t blocks allocated)
// with an initial block pointer array.
void bli_apool_init
(
apool_t* restrict apool
);
// Frees every array_t block held by the apool_t, and then the block pointer
// array. Aborts if the pool's top_index is not zero.
void bli_apool_finalize
(
apool_t* restrict apool
);
// Checks out one array_t (growing the pool by one block if it is exhausted)
// and resizes it to hold at least n_threads elements.
array_t* bli_apool_checkout_array
(
siz_t n_threads,
apool_t* restrict apool
);
// Returns a previously checked-out array_t to the apool_t.
void bli_apool_checkin_array
(
array_t* restrict array,
apool_t* restrict apool
);
// Returns the pool_t stored at the given index of the array_t, allocating
// and initializing it first if the element is still NULL.
pool_t* bli_apool_array_elem
(
siz_t index,
array_t* restrict array
);
// Adds num_blocks_add newly allocated array_t blocks to the apool_t. Does
// not acquire the apool_t's mutex.
void bli_apool_grow
(
siz_t num_blocks_add,
apool_t* restrict apool
);
// Allocates and initializes one array_t with num_elem elements and stores
// its address in *array_p.
void bli_apool_alloc_block
(
siz_t num_elem,
array_t** restrict array_p
);
// Finalizes and frees every non-NULL pool_t stored in the array_t, then
// frees the array_t's buffer and the array_t itself.
void bli_apool_free_block
(
array_t* restrict array
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_arch.c 0000664 0000000 0000000 00000022125 14634250137 0021544 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef BLIS_CONFIGURETIME_CPUID
// NOTE: If you need to make any changes to this cpp branch, it's probably
// the case that you also need to modify bli_arch.c, bli_cpuid.c, and
// bli_env.c. Don't forget to update these other files as needed!
// The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp
// branch in bli_system.h is processed. (This macro is normally defined in
// bli_config.h.)
#define BLIS_ENABLE_SYSTEM
// Use C-style static inline functions for any static inline functions that
// happen to be defined by the headers below. (This macro is normally defined
// in bli_config_macro_defs.h.)
#define BLIS_INLINE static
// Since we're not building a shared library, we can forgo the use of the
// BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro
// is normally defined in bli_config_macro_defs.h.)
#define BLIS_EXPORT_BLIS
#include "bli_system.h"
#include "bli_type_defs.h"
#include "bli_arch.h"
#include "bli_cpuid.h"
#include "bli_env.h"
#else
#include "blis.h"
#endif
// -----------------------------------------------------------------------------
// Cached arch_t id for the currently running hardware. It starts out as the
// placeholder value -1, which is meant to be replaced when
// bli_arch_set_id() runs.
static arch_t id = -1;

// Returns the cached arch_t id, first making sure (via
// bli_arch_set_id_once()) that it has been determined.
arch_t bli_arch_query_id( void )
{
    bli_arch_set_id_once();

    // Hand back whatever value is cached at this point.
    const arch_t cached_id = id;

    return cached_id;
}
// -----------------------------------------------------------------------------
// Control object for bli_pthread_once(). pthread_once() guarantees that,
// among all threads passing in this same object, the associated function
// executes exactly once.
static bli_pthread_once_t once_id = BLIS_PTHREAD_ONCE_INIT;

// Runs bli_arch_set_id() through bli_pthread_once(). When
// BLIS_CONFIGURETIME_CPUID is defined this function has an empty body.
void bli_arch_set_id_once( void )
{
#if !defined( BLIS_CONFIGURETIME_CPUID )
    bli_pthread_once( &once_id, bli_arch_set_id );
#endif
}
// -----------------------------------------------------------------------------
// Determines which sub-configuration to use and stores its arch_t value in
// the file-scope variable 'id'.
//
// The steps are:
// 1. Read BLIS_ARCH_DEBUG and enable/disable logging accordingly.
// 2. Read BLIS_ARCH_TYPE. Unless BLIS_CONFIGURETIME_CPUID is defined, a
//    value other than -1 is validated and then used as the id.
// 3. Otherwise, the id is chosen by the BLIS_FAMILY_* macro that is
//    defined: family macros query the hardware via bli_cpuid_query_id(),
//    while microarchitecture macros map to a fixed BLIS_ARCH_* value.
// 4. If logging is enabled, the selected sub-configuration is reported on
//    stderr.
void bli_arch_set_id( void )
{
// Check the environment variable BLIS_ARCH_DEBUG to see if the user
// requested that we echo the result of the subconfiguration selection.
bool do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 );
bli_arch_set_logging( do_logging );
// Check the environment variable BLIS_ARCH_TYPE to see if the user
// requested that we use a specific subconfiguration.
// NOTE: req_id is only consulted inside the #ifndef branch below; when
// BLIS_CONFIGURETIME_CPUID is defined it is read but otherwise unused.
dim_t req_id = bli_env_get_var( "BLIS_ARCH_TYPE", -1 );
#ifndef BLIS_CONFIGURETIME_CPUID
if ( req_id != -1 )
{
// BLIS_ARCH_TYPE was set. Cautiously check whether its value is usable.
// If req_id was set to an invalid arch_t value (ie: outside the range
// [0,BLIS_NUM_ARCHS-1]), output an error message and abort.
if ( bli_error_checking_is_enabled() )
{
err_t e_val = bli_check_valid_arch_id( req_id );
bli_check_error_code( e_val );
}
// At this point, we know that req_id is in the valid range, but we
// don't yet know if it refers to a context that was actually
// initialized. Query the address of an internal context data structure
// corresponding to req_id. This pointer will be NULL if the associated
// subconfig is not available.
cntx_t** req_cntx = bli_gks_lookup_id( req_id );
// This function checks the context pointer and aborts with a useful
// error message if the pointer is found to be NULL.
if ( bli_error_checking_is_enabled() )
{
err_t e_val = bli_check_initialized_gks_cntx( req_cntx );
bli_check_error_code( e_val );
}
// Finally, we can be confident that req_id (1) is in range and (2)
// refers to a context that has been initialized.
id = req_id;
}
else
#endif
// NOTE: When BLIS_CONFIGURETIME_CPUID is defined, the if/else above is
// compiled out and the braces below form a plain block that always runs.
{
// BLIS_ARCH_TYPE was unset. Proceed with normal subconfiguration
// selection behavior.
// NOTE: Each #if/#ifdef section below assigns to id unconditionally. If
// more than one BLIS_FAMILY_* macro were defined, the assignment that
// appears last in this list would win; if none is defined, id keeps the
// value it had on entry.
// Architecture families.
#if defined BLIS_FAMILY_INTEL64 || \
defined BLIS_FAMILY_AMD64 || \
defined BLIS_FAMILY_X86_64 || \
defined BLIS_FAMILY_ARM64 || \
defined BLIS_FAMILY_ARM32 || \
defined BLIS_FAMILY_X86_64_NO_SKX || \
defined BLIS_FAMILY_X86_64_NO_ZEN2 || \
defined BLIS_FAMILY_X86_64_NO_ZEN3
id = bli_cpuid_query_id();
#endif
// Intel microarchitectures.
#ifdef BLIS_FAMILY_SKX
id = BLIS_ARCH_SKX;
#endif
#ifdef BLIS_FAMILY_KNL
id = BLIS_ARCH_KNL;
#endif
#ifdef BLIS_FAMILY_KNC
id = BLIS_ARCH_KNC;
#endif
#ifdef BLIS_FAMILY_HASWELL
id = BLIS_ARCH_HASWELL;
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
id = BLIS_ARCH_SANDYBRIDGE;
#endif
#ifdef BLIS_FAMILY_PENRYN
id = BLIS_ARCH_PENRYN;
#endif
// AMD microarchitectures.
#ifdef BLIS_FAMILY_ZEN3
id = BLIS_ARCH_ZEN3;
#endif
#ifdef BLIS_FAMILY_ZEN2
id = BLIS_ARCH_ZEN2;
#endif
#ifdef BLIS_FAMILY_ZEN
id = BLIS_ARCH_ZEN;
#endif
#ifdef BLIS_FAMILY_EXCAVATOR
id = BLIS_ARCH_EXCAVATOR;
#endif
#ifdef BLIS_FAMILY_STEAMROLLER
id = BLIS_ARCH_STEAMROLLER;
#endif
#ifdef BLIS_FAMILY_PILEDRIVER
id = BLIS_ARCH_PILEDRIVER;
#endif
#ifdef BLIS_FAMILY_BULLDOZER
id = BLIS_ARCH_BULLDOZER;
#endif
// ARM microarchitectures.
#ifdef BLIS_FAMILY_ARMSVE
id = BLIS_ARCH_ARMSVE;
#endif
#ifdef BLIS_FAMILY_A64FX
id = BLIS_ARCH_A64FX;
#endif
#ifdef BLIS_FAMILY_FIRESTORM
id = BLIS_ARCH_FIRESTORM;
#endif
#ifdef BLIS_FAMILY_THUNDERX2
id = BLIS_ARCH_THUNDERX2;
#endif
#ifdef BLIS_FAMILY_CORTEXA57
id = BLIS_ARCH_CORTEXA57;
#endif
#ifdef BLIS_FAMILY_CORTEXA53
id = BLIS_ARCH_CORTEXA53;
#endif
#ifdef BLIS_FAMILY_CORTEXA15
id = BLIS_ARCH_CORTEXA15;
#endif
#ifdef BLIS_FAMILY_CORTEXA9
id = BLIS_ARCH_CORTEXA9;
#endif
// IBM microarchitectures.
#ifdef BLIS_FAMILY_POWER10
id = BLIS_ARCH_POWER10;
#endif
#ifdef BLIS_FAMILY_POWER9
id = BLIS_ARCH_POWER9;
#endif
#ifdef BLIS_FAMILY_POWER7
id = BLIS_ARCH_POWER7;
#endif
#ifdef BLIS_FAMILY_BGQ
id = BLIS_ARCH_BGQ;
#endif
// Generic microarchitecture.
#ifdef BLIS_FAMILY_GENERIC
id = BLIS_ARCH_GENERIC;
#endif
}
// Report the selection on stderr if logging was requested above.
if ( bli_arch_get_logging() )
fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
bli_arch_string( id ) );
//printf( "blis_arch_query_id(): id = %u\n", id );
//exit(1);
}
// -----------------------------------------------------------------------------
// Table of sub-configuration names, one per arch_t value.
// NOTE: This string array must be kept up-to-date with the arch_t
// enumeration that is typedef'ed in bli_type_defs.h. That is, the
// index order of each string should correspond to the implied/assigned
// enum value given to the corresponding BLIS_ARCH_ value.
// bli_arch_string() indexes this table directly with an arch_t value, so a
// missing or misordered entry results in the wrong name being reported.
static char* config_name[ BLIS_NUM_ARCHS ] =
{
"skx",
"knl",
"knc",
"haswell",
"sandybridge",
"penryn",
"zen3",
"zen2",
"zen",
"excavator",
"steamroller",
"piledriver",
"bulldozer",
"armsve",
"a64fx",
"firestorm",
"thunderx2",
"cortexa57",
"cortexa53",
"cortexa15",
"cortexa9",
"power10",
"power9",
"power7",
"bgq",
"generic"
};
// Returns the name of the sub-configuration that corresponds to the given
// arch_t value, as listed in the config_name table.
//
// If id does not index a valid table entry, the string "unknown" is
// returned instead of reading outside the table. This includes the -1
// placeholder that the cached id holds before a sub-configuration has been
// selected. The returned string must not be modified or freed.
char* bli_arch_string( arch_t id )
{
    const size_t n_names = sizeof( config_name ) / sizeof( config_name[0] );

    // Converting to an unsigned type maps negative values to very large
    // ones, so a single comparison rejects both kinds of invalid index.
    const size_t index = ( size_t )id;

    if ( index >= n_names ) return "unknown";

    return config_name[ index ];
}
// -----------------------------------------------------------------------------
// Whether bli_arch_log() emits anything. Logging is off by default.
static bool arch_dolog = 0;

// Enables (non-zero) or disables (zero) logging via bli_arch_log().
void bli_arch_set_logging( bool dolog )
{
    arch_dolog = dolog;
}

// Returns whether logging via bli_arch_log() is currently enabled.
bool bli_arch_get_logging( void )
{
    return arch_dolog;
}

// Writes a printf-style message to stderr, prefixed with "libblis: ", but
// only if logging is enabled.
//
// - fmt: printf-style format string; a NULL fmt is ignored.
// - ...: the arguments consumed by fmt.
void bli_arch_log( char* fmt, ... )
{
    // Nothing to do when logging is off or there is no format string. This
    // check must come before any use of fmt, since calling strlen() on a
    // NULL pointer is undefined behavior.
    if ( !bli_arch_get_logging() || fmt == NULL ) return;

    const char   prefix[] = "libblis: ";
    const size_t n_chars  = strlen( prefix ) + strlen( fmt ) + 1;

    // Build "<prefix><fmt>" so that the message can be emitted with a
    // single vfprintf() call.
    char* prefix_fmt = malloc( n_chars );

    va_list ap;
    va_start( ap, fmt );

    if ( prefix_fmt != NULL )
    {
        snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt );
        vfprintf( stderr, prefix_fmt, ap );
    }
    else
    {
        // The allocation failed; fall back to emitting the prefix and the
        // message separately rather than dereferencing a NULL buffer.
        fputs( prefix, stderr );
        vfprintf( stderr, fmt, ap );
    }

    va_end( ap );

    // free() accepts NULL, so this is safe on the fallback path as well.
    free( prefix_fmt );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_arch.h 0000664 0000000 0000000 00000003725 14634250137 0021556 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H
// Returns the cached arch_t id, determining it first if necessary.
BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );
// Runs bli_arch_set_id() through bli_pthread_once().
void bli_arch_set_id_once( void );
// Determines the sub-configuration to use and caches its arch_t id.
void bli_arch_set_id( void );
// Returns the sub-configuration name that corresponds to the given arch_t.
BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );
// Enables or disables logging via bli_arch_log().
void bli_arch_set_logging( bool dolog );
// Returns whether logging via bli_arch_log() is enabled.
bool bli_arch_get_logging( void );
// Writes a printf-style message, prefixed with "libblis: ", to stderr if
// logging is enabled.
void bli_arch_log( char*, ... );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_array.c 0000664 0000000 0000000 00000015114 14634250137 0021745 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//#define BLIS_ENABLE_MEM_TRACING
// Prepare an array_t that holds num_elem elements of elem_size bytes each.
// A buffer of num_elem * elem_size bytes is obtained via bli_malloc_intl(),
// cleared to zero, and then recorded in the array_t along with the element
// count and element size.
void bli_array_init
     (
       const siz_t       num_elem,
       const siz_t       elem_size,
       array_t* restrict array
     )
{
	err_t status;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_array_init(): allocating array [%d * %d]: ",
	        ( int )num_elem, ( int )elem_size );
#endif

	// Total footprint of the array, in bytes.
	const size_t n_bytes = num_elem * elem_size;

	// Obtain the backing storage from the internal allocator.
	void* restrict storage = bli_malloc_intl( n_bytes, &status );

	// Clear every byte of the buffer. THIS IS IMPORTANT: consumer threads
	// inspect the NULL-ness of each element to decide whether the
	// corresponding block (data structure) still needs to be
	// created/allocated and initialized.
	memset( storage, 0, n_bytes );

	// Record the geometry and the buffer address in the array_t.
	bli_array_set_num_elem( num_elem, array );
	bli_array_set_elem_size( elem_size, array );
	bli_array_set_buf( storage, array );
}
// Grow an array_t so that it holds num_elem_new elements. The array is never
// shrunk: a request that does not exceed the current element count returns
// immediately. Otherwise a larger buffer is allocated, the existing contents
// are copied into its leading bytes, the old buffer is released, and the
// newly added tail is zero-filled.
void bli_array_resize
     (
       const siz_t       num_elem_new,
       array_t* restrict array
     )
{
	err_t status;

	// Current capacity, in elements.
	const siz_t old_count = bli_array_num_elem( array );

	// Nothing to do unless the array actually has to grow.
	if ( num_elem_new <= old_count ) return;

	// From here on, old_count < num_elem_new.

	// Per-element size, and the byte footprint before and after the resize.
	const siz_t  elem_size = bli_array_elem_size( array );
	const size_t old_bytes = old_count    * elem_size;
	const size_t new_bytes = num_elem_new * elem_size;

	// The buffer currently attached to the array.
	void* restrict old_buf = bli_array_buf( array );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_array_resize(): allocating array [%d * %d]: ",
	        ( int )num_elem_new, ( int )elem_size );
#endif

	// Allocate the replacement buffer. It is kept as a char* so that byte
	// offsets into it can be computed directly below.
	char* restrict new_buf = bli_malloc_intl( new_bytes, &status );

	// Carry the existing elements over to the front of the new buffer.
	memcpy( new_buf, old_buf, old_bytes );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_array_resize(): freeing array [%d * %d]: ",
	        ( int )old_count, ( int )elem_size );
#endif

	// The old contents now live in the new buffer, so the old buffer can go.
	bli_free_intl( old_buf );

	// Zero the newly added region, which begins old_bytes into the new
	// buffer and runs to its end.
	memset( new_buf + old_bytes, 0, new_bytes - old_bytes );

	// Publish the new buffer and element count. The elem_size field is
	// unchanged and therefore left alone.
	bli_array_set_buf( new_buf, array );
	bli_array_set_num_elem( num_elem_new, array );
}
// Release the buffer attached to an array_t via bli_free_intl(). The fields
// of the array_t itself are not modified.
void bli_array_finalize
     (
       array_t* restrict array
     )
{
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_array_finalize(): freeing buf (length %d): ",
	        ( int )bli_array_num_elem( array ) );
#endif

	// Look up the buffer and hand it straight back to the allocator.
	bli_free_intl( bli_array_buf( array ) );
}
// Return the address of element number index within the array's buffer.
// Aborts (via bli_abort()) if index is not smaller than the number of
// elements in the array.
void* bli_array_elem
     (
       const siz_t       index,
       array_t* restrict array
     )
{
	// Sanity check: refuse any index at or beyond the element count.
	if ( index >= bli_array_num_elem( array ) ) bli_abort();

	// View the buffer as bytes so that the offset arithmetic is in bytes.
	char* restrict base = bli_array_buf( array );

	// Byte offset of the requested element from the start of the buffer.
	const siz_t offset = index * bli_array_elem_size( array );

	return ( void* )( base + offset );
}
// Copy one element's worth of data (elem_size bytes, read from the address
// elem) into slot number index of the array's buffer.
//
// - elem:  address of the source data; exactly elem_size bytes are read.
// - index: destination slot. Aborts (via bli_abort()) if index is not
//          smaller than the number of elements in the array, matching the
//          bounds check performed by bli_array_elem().
// - array: the array_t whose buffer is written.
void bli_array_set_elem
     (
       void*    restrict elem,
       const siz_t       index,
       array_t* restrict array
     )
{
	// Sanity check: disallow writes beyond the bounds of the array. Without
	// this, an out-of-range index would silently overwrite memory past the
	// end of the buffer.
	if ( bli_array_num_elem( array ) <= index ) bli_abort();

	// Query the size of each element in the array.
	const siz_t elem_size = bli_array_elem_size( array );

	// Query the buffer from the array as a char* so that byte offsets can be
	// computed directly in the general case below.
	char* restrict buf = bli_array_buf( array );

	if ( elem_size == sizeof( void* ) )
	{
#ifdef BLIS_ENABLE_MEM_TRACING
		printf( "bli_array_set_elem(): elem_size is %d; setting index %d.\n",
		        ( int )elem_size, ( int )index );
		fflush( stdout );
#endif

		// Special case: when elements are pointer-sized, treat both the
		// buffer and the source as arrays of void* and perform a single
		// pointer assignment instead of calling memcpy().
		void** restrict buf_vvp  = ( void** )buf;
		void** restrict elem_vvp = ( void** )elem;

		buf_vvp[ index ] = *elem_vvp;
	}
	else
	{
		// General case: copy elem_size bytes from elem into the buffer at
		// the byte offset that corresponds to the requested index.
		memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size );
	}
}
cython-blis-1.0.0/blis/_src/frame/base/bli_array.h 0000664 0000000 0000000 00000006016 14634250137 0021753 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_ARRAY_H
#define BLIS_ARRAY_H
// -- Array type --
/*
typedef struct
{
void* buf;
siz_t num_elem;
siz_t elem_size;
} array_t;
*/
// Array entry query
// Return the buffer address stored in the array_t.
BLIS_INLINE void* bli_array_buf( array_t* array )
{
	void* const buf = array->buf;

	return buf;
}
// Return the number of elements recorded in the array_t.
BLIS_INLINE siz_t bli_array_num_elem( array_t* array )
{
	const siz_t count = array->num_elem;

	return count;
}
// Return the per-element size recorded in the array_t.
BLIS_INLINE siz_t bli_array_elem_size( array_t* array )
{
	const siz_t size = array->elem_size;

	return size;
}
// Array entry modification
// Store a buffer address in the array_t. Only the pointer is recorded; no
// memory is allocated, copied, or freed here.
BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array )
{
	array->buf = buf;
}
// Store the element count in the array_t. The buffer is not touched.
BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array )
{
	array->num_elem = num_elem;
}
// Store the per-element size in the array_t. The buffer is not touched.
BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array )
{
	array->elem_size = elem_size;
}
// -----------------------------------------------------------------------------
// Allocate a zero-filled buffer of num_elem * elem_size bytes and record it,
// together with num_elem and elem_size, in array.
void bli_array_init
(
const siz_t num_elem,
const siz_t elem_size,
array_t* restrict array
);
// Grow array to num_elem_new elements, preserving the existing contents and
// zero-filling the added elements. Does nothing if num_elem_new does not
// exceed the current element count.
void bli_array_resize
(
const siz_t num_elem_new,
array_t* restrict array
);
// Free the buffer attached to array.
void bli_array_finalize
(
array_t* restrict array
);
// Return the address of element index within array's buffer; aborts if index
// is out of bounds.
void* bli_array_elem
(
const siz_t index,
array_t* restrict array
);
// Copy elem_size bytes from the address elem into element index of array's
// buffer.
void bli_array_set_elem
(
void* restrict elem,
const siz_t index,
array_t* restrict array
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_auxinfo.h 0000664 0000000 0000000 00000007047 14634250137 0022313 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H
// auxinfo_t field query
// Return the schema_a field of the auxinfo_t.
BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai )
{
	const pack_t schema = ai->schema_a;

	return schema;
}
// Return the schema_b field of the auxinfo_t.
BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai )
{
	const pack_t schema = ai->schema_b;

	return schema;
}
// Return the a_next pointer stored in the auxinfo_t.
BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai )
{
	void* const next = ai->a_next;

	return next;
}
// Return the b_next pointer stored in the auxinfo_t.
BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai )
{
	void* const next = ai->b_next;

	return next;
}
// Return the is_a field of the auxinfo_t.
BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai )
{
	const inc_t is = ai->is_a;

	return is;
}
// Return the is_b field of the auxinfo_t.
BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai )
{
	const inc_t is = ai->is_b;

	return is;
}
// Return the ps_a field of the auxinfo_t.
BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
{
	const inc_t ps = ai->ps_a;

	return ps;
}
// Return the ps_b field of the auxinfo_t.
BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
{
	const inc_t ps = ai->ps_b;

	return ps;
}
// Return the ukr function pointer stored in the auxinfo_t.
BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai )
{
	const void_fp fp = ai->ukr;

	return fp;
}
// Return the params pointer stored in the auxinfo_t.
BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai )
{
	void* const params = ai->params;

	return params;
}
// auxinfo_t field modification
// Store schema in the schema_a field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai )
{
ai->schema_a = schema;
}
// Store schema in the schema_b field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai )
{
ai->schema_b = schema;
}
// Store p in the a_next field of the auxinfo_t. Only the pointer is recorded.
BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai )
{
ai->a_next = p;
}
// Store p in the b_next field of the auxinfo_t. Only the pointer is recorded.
BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai )
{
ai->b_next = p;
}
// Set both "next" pointers of the auxinfo_t in one call: ap is stored in
// a_next and bp is stored in b_next.
BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai )
{
	// The two stores target distinct fields and are independent.
	ai->b_next = bp;
	ai->a_next = ap;
}
// Store is in the is_a field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai )
{
ai->is_a = is;
}
// Store is in the is_b field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
{
ai->is_b = is;
}
// Store ps in the ps_a field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai )
{
ai->ps_a = ps;
}
// Store ps in the ps_b field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
{
ai->ps_b = ps;
}
// Store the function pointer ukr in the ukr field of the auxinfo_t.
BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
{
ai->ukr = ukr;
}
// Store params in the params field of the auxinfo_t. Only the pointer is
// recorded.
BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai )
{
ai->params = params;
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_blksz.c 0000664 0000000 0000000 00000024513 14634250137 0021757 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Allocate a blksz_t via bli_malloc_intl() and initialize it with
// bli_blksz_init_ed(). Arguments are given as (default, max) pairs per
// datatype, in the order s, d, c, z. Returns the new object.
blksz_t* bli_blksz_create_ed
     (
       dim_t b_s, dim_t be_s,
       dim_t b_d, dim_t be_d,
       dim_t b_c, dim_t be_c,
       dim_t b_z, dim_t be_z
     )
{
	err_t status;

	// Obtain storage for exactly one blksz_t.
	blksz_t* blksz = bli_malloc_intl( sizeof( *blksz ), &status );

	// Populate it from the interleaved (default, max) arguments.
	bli_blksz_init_ed( blksz,
	                   b_s, be_s,
	                   b_d, be_d,
	                   b_c, be_c,
	                   b_z, be_z );

	return blksz;
}
// Allocate a blksz_t via bli_malloc_intl() and initialize it with
// bli_blksz_init(). The four default values (s, d, c, z) come first, followed
// by the four max values (s, d, c, z). Returns the new object.
blksz_t* bli_blksz_create
     (
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     )
{
	err_t status;

	// Obtain storage for exactly one blksz_t.
	blksz_t* blksz = bli_malloc_intl( sizeof( *blksz ), &status );

	// Populate it from the grouped default and max arguments.
	bli_blksz_init( blksz,
	                b_s,  b_d,  b_c,  b_z,
	                be_s, be_d, be_c, be_z );

	return blksz;
}
// Fill in an existing blksz_t. Arguments arrive as (default, max) pairs per
// datatype; each b_* value is stored in the v array and each be_* value in
// the e array, indexed by the corresponding datatype constant.
void bli_blksz_init_ed
     (
       blksz_t* b,
       dim_t    b_s, dim_t be_s,
       dim_t    b_d, dim_t be_d,
       dim_t    b_c, dim_t be_c,
       dim_t    b_z, dim_t be_z
     )
{
	// Single-precision real.
	b->v[BLIS_FLOAT]    = b_s;
	b->e[BLIS_FLOAT]    = be_s;

	// Double-precision real.
	b->v[BLIS_DOUBLE]   = b_d;
	b->e[BLIS_DOUBLE]   = be_d;

	// Single-precision complex.
	b->v[BLIS_SCOMPLEX] = b_c;
	b->e[BLIS_SCOMPLEX] = be_c;

	// Double-precision complex.
	b->v[BLIS_DCOMPLEX] = b_z;
	b->e[BLIS_DCOMPLEX] = be_z;
}
// Fill in an existing blksz_t. The four b_* arguments are stored in the v
// array and the four be_* arguments in the e array, each indexed by the
// corresponding datatype constant. This differs from bli_blksz_init_ed() only
// in the order in which the arguments are passed.
void bli_blksz_init
     (
       blksz_t* b,
       dim_t    b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t    be_s, dim_t be_d, dim_t be_c, dim_t be_z
     )
{
	// Single-precision real.
	b->v[BLIS_FLOAT]    = b_s;
	b->e[BLIS_FLOAT]    = be_s;

	// Double-precision real.
	b->v[BLIS_DOUBLE]   = b_d;
	b->e[BLIS_DOUBLE]   = be_d;

	// Single-precision complex.
	b->v[BLIS_SCOMPLEX] = b_c;
	b->e[BLIS_SCOMPLEX] = be_c;

	// Double-precision complex.
	b->v[BLIS_DCOMPLEX] = b_z;
	b->e[BLIS_DCOMPLEX] = be_z;
}
// Fill in an existing blksz_t using one value per datatype: for each of
// s, d, c, z the same value is written to both the v and the e entry.
void bli_blksz_init_easy
     (
       blksz_t* b,
       dim_t    b_s, dim_t b_d, dim_t b_c, dim_t b_z
     )
{
	// Single-precision real.
	b->e[BLIS_FLOAT]    = b_s;
	b->v[BLIS_FLOAT]    = b_s;

	// Double-precision real.
	b->e[BLIS_DOUBLE]   = b_d;
	b->v[BLIS_DOUBLE]   = b_d;

	// Single-precision complex.
	b->e[BLIS_SCOMPLEX] = b_c;
	b->v[BLIS_SCOMPLEX] = b_c;

	// Double-precision complex.
	b->e[BLIS_DCOMPLEX] = b_z;
	b->v[BLIS_DCOMPLEX] = b_z;
}
// Release a blksz_t by passing it to bli_free_intl(). This is the counterpart
// of bli_blksz_create() and bli_blksz_create_ed(), which allocate with
// bli_malloc_intl().
void bli_blksz_free
(
blksz_t* b
)
{
bli_free_intl( b );
}
// -----------------------------------------------------------------------------
#if 0
// NOTE: This function sits inside an "#if 0" region and is therefore not
// compiled. It rounds both the default and the maximum blocksize of datatype
// dt_bs in blksz down to a multiple of the default value of datatype dt_bm in
// bmult. The enabled functions bli_blksz_reduce_def_to() and
// bli_blksz_reduce_max_to() each perform one half of this work.
void bli_blksz_reduce_dt_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
)
{
dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz );
dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz );
dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult );
// If the blocksize multiple is zero, we do nothing.
if ( bmult_val == 0 ) return;
// Round the default and maximum blocksize values down to their
// respective nearest multiples of bmult_val. (Notice that we
// ignore the "max" entry in the bmult object since that would
// correspond to the packing dimension, which plays no role
// as a blocksize multiple.)
blksz_def = ( blksz_def / bmult_val ) * bmult_val;
blksz_max = ( blksz_max / bmult_val ) * bmult_val;
// Make sure the new blocksize values are at least the blocksize
// multiple.
if ( blksz_def == 0 ) blksz_def = bmult_val;
if ( blksz_max == 0 ) blksz_max = bmult_val;
// Store the new blocksizes back to the object.
bli_blksz_set_def( blksz_def, dt_bs, blksz );
bli_blksz_set_max( blksz_max, dt_bs, blksz );
}
#endif
// -----------------------------------------------------------------------------
// Round the default blocksize of datatype dt_bs in blksz down to a multiple
// of the default value of datatype dt_bm in bmult. If rounding would yield
// zero, the multiple itself is used instead. A zero multiple leaves blksz
// untouched. The max entry of blksz is not modified.
void bli_blksz_reduce_def_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     )
{
	// Only the "default" entry of bmult serves as the multiple; its "max"
	// entry is ignored since that would correspond to the packing
	// dimension, which plays no role as a blocksize multiple.
	const dim_t multiple = bli_blksz_get_def( dt_bm, bmult );

	// A zero multiple means there is nothing to do.
	if ( multiple == 0 ) return;

	// Drop the remainder so that the default blocksize becomes the nearest
	// multiple at or below its current value.
	dim_t reduced = bli_blksz_get_def( dt_bs, blksz );
	reduced -= reduced % multiple;

	// Never let the blocksize fall below one full multiple.
	if ( reduced == 0 ) reduced = multiple;

	// Write the adjusted default blocksize back to the object.
	bli_blksz_set_def( reduced, dt_bs, blksz );
}
// -----------------------------------------------------------------------------
// Round the maximum blocksize of datatype dt_bs in blksz down to a multiple
// of the default value of datatype dt_bm in bmult. If rounding would yield
// zero, the multiple itself is used instead. A zero multiple leaves blksz
// untouched. The default entry of blksz is not modified.
void bli_blksz_reduce_max_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     )
{
	// Only the "default" entry of bmult serves as the multiple; its "max"
	// entry is ignored since that would correspond to the packing
	// dimension, which plays no role as a blocksize multiple.
	const dim_t multiple = bli_blksz_get_def( dt_bm, bmult );

	// A zero multiple means there is nothing to do.
	if ( multiple == 0 ) return;

	// Drop the remainder so that the maximum blocksize becomes the nearest
	// multiple at or below its current value.
	dim_t reduced = bli_blksz_get_max( dt_bs, blksz );
	reduced -= reduced % multiple;

	// Never let the blocksize fall below one full multiple.
	if ( reduced == 0 ) reduced = multiple;

	// Write the adjusted maximum blocksize back to the object.
	bli_blksz_set_max( reduced, dt_bs, blksz );
}
// -----------------------------------------------------------------------------
// Dispatch on the traversal direction: BLIS_FWD selects
// bli_determine_blocksize_f(); any other value selects
// bli_determine_blocksize_b(). The remaining arguments are forwarded as-is
// and the chosen function's result is returned.
dim_t bli_determine_blocksize
     (
       dir_t   direct,
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	return ( direct == BLIS_FWD )
	       ? bli_determine_blocksize_f( i, dim, obj, bszid, cntx )
	       : bli_determine_blocksize_b( i, dim, obj, bszid, cntx );
}
// Forward-direction blocksize query. Looks up the blksz_t identified by bszid
// in the context, reads its default and max values for the object's
// execution datatype, and defers the decision to
// bli_determine_blocksize_f_sub().
dim_t bli_determine_blocksize_f
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype selects which entries of the blksz_t to read.
	const num_t dt    = bli_obj_exec_dt( obj );
	blksz_t*    bsize = bli_cntx_get_blksz( bszid, cntx );

	// Default (algorithmic) and maximum blocksizes for that datatype.
	const dim_t b_alg = bli_blksz_get_def( dt, bsize );
	const dim_t b_max = bli_blksz_get_max( dt, bsize );

	return bli_determine_blocksize_f_sub( i, dim, b_alg, b_max );
}
// Backward-direction blocksize query. Looks up the blksz_t identified by
// bszid in the context, reads its default and max values for the object's
// execution datatype, and defers the decision to
// bli_determine_blocksize_b_sub().
dim_t bli_determine_blocksize_b
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype selects which entries of the blksz_t to read.
	const num_t dt    = bli_obj_exec_dt( obj );
	blksz_t*    bsize = bli_cntx_get_blksz( bszid, cntx );

	// Default (algorithmic) and maximum blocksizes for that datatype.
	const dim_t b_alg = bli_blksz_get_def( dt, bsize );
	const dim_t b_max = bli_blksz_get_max( dt, bsize );

	return bli_determine_blocksize_b_sub( i, dim, b_alg, b_max );
}
// Choose the blocksize for an algorithm that moves "forward" (ie: top to
// bottom, left to right, top-left to bottom-right). With dim - i of the
// dimension remaining: if the remainder is at most b_max, the whole remainder
// is returned; otherwise b_alg is returned.
dim_t bli_determine_blocksize_f_sub
     (
       dim_t i,
       dim_t dim,
       dim_t b_alg,
       dim_t b_max
     )
{
	// How much of the dimension is left, including the chunk whose size is
	// being decided right now.
	const dim_t remaining = dim - i;

	// A remainder that fits within (is less than or equal to) the maximum
	// blocksize is consumed in one go; anything larger gets the default.
	return ( remaining <= b_max ) ? remaining : b_alg;
}
// Choose the blocksize for an algorithm that moves "backward" (ie: bottom to
// top, right to left, bottom-right to top-left). With dim - i of the
// dimension remaining, the result is:
//   - 0                    if nothing remains;
//   - b_alg                if the remainder is a multiple of b_alg;
//   - the remainder        if it is at most b_max;
//   - b_alg + edge         if edge <= b_max - b_alg;
//   - edge                 otherwise,
// where edge is the remainder modulo b_alg.
dim_t bli_determine_blocksize_b_sub
     (
       dim_t i,
       dim_t dim,
       dim_t b_alg,
       dim_t b_max
     )
{
	// How much of the dimension is left, including the chunk whose size is
	// being decided right now.
	const dim_t remaining = dim - i;

	// Sanity check: with nothing left there is no block to hand out.
	if ( remaining == 0 ) return 0;

	// Size of the partial ("edge") block left over after carving the
	// remainder into b_alg-sized pieces.
	const dim_t edge = remaining % b_alg;

	// An exact multiple of b_alg needs no special edge handling.
	if ( edge == 0 ) return b_alg;

	// A remainder no larger than the maximum blocksize is taken whole.
	if ( remaining <= b_max ) return remaining;

	// Here remaining > b_max. If the edge is small enough to be merged
	// with one default-sized block without exceeding b_max, take both
	// together.
	if ( edge <= b_max - b_alg ) return b_alg + edge;

	// Otherwise the edge is too large to merge and is handled on its own.
	return edge;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_blksz.h 0000664 0000000 0000000 00000016056 14634250137 0021767 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// blksz_t query
// Return the default blocksize (the v entry) stored for datatype dt.
BLIS_INLINE dim_t bli_blksz_get_def
     (
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t def_val = b->v[ dt ];

	return def_val;
}
// Return the maximum blocksize (the e entry) stored for datatype dt.
BLIS_INLINE dim_t bli_blksz_get_max
     (
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t max_val = b->e[ dt ];

	return max_val;
}
// blksz_t modification
// Store val as the default blocksize (the v entry) for datatype dt.
BLIS_INLINE void bli_blksz_set_def
(
dim_t val,
num_t dt,
blksz_t* b
)
{
b->v[ dt ] = val;
}
// Store val as the maximum blocksize (the e entry) for datatype dt.
BLIS_INLINE void bli_blksz_set_max
(
dim_t val,
num_t dt,
blksz_t* b
)
{
b->e[ dt ] = val;
}
// Copy the entire contents of b_src into b_dst by struct assignment.
BLIS_INLINE void bli_blksz_copy
(
blksz_t* b_src,
blksz_t* b_dst
)
{
*b_dst = *b_src;
}
// Copy blocksize values from b_src to b_dst, but only those that are
// positive. For each of the four floating-point datatypes, the default and
// the max value are considered independently; non-positive source values
// leave the corresponding destination entry unchanged.
BLIS_INLINE void bli_blksz_copy_if_pos
     (
       blksz_t* b_src,
       blksz_t* b_dst
     )
{
	// The datatypes whose entries are eligible for copying.
	const num_t dts[ 4 ] =
	{
		BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX
	};

	for ( int k = 0; k < 4; ++k )
	{
		const num_t dt      = dts[ k ];
		const dim_t def_val = bli_blksz_get_def( dt, b_src );
		const dim_t max_val = bli_blksz_get_max( dt, b_src );

		// Skip any value that is zero or negative.
		if ( def_val > 0 ) bli_blksz_set_def( def_val, dt, b_dst );
		if ( max_val > 0 ) bli_blksz_set_max( max_val, dt, b_dst );
	}
}
// Copy the default blocksize of datatype dt_src in b_src to the default
// blocksize of datatype dt_dst in b_dst.
BLIS_INLINE void bli_blksz_copy_def_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	bli_blksz_set_def( bli_blksz_get_def( dt_src, b_src ), dt_dst, b_dst );
}
// Copy the maximum blocksize of datatype dt_src in b_src to the maximum
// blocksize of datatype dt_dst in b_dst.
BLIS_INLINE void bli_blksz_copy_max_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	bli_blksz_set_max( bli_blksz_get_max( dt_src, b_src ), dt_dst, b_dst );
}
// Copy both the default and the maximum blocksize of datatype dt_src in
// b_src to the entries of datatype dt_dst in b_dst.
BLIS_INLINE void bli_blksz_copy_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	// The two copies touch different arrays (e and v), so their order does
	// not matter.
	bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst );
	bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst );
}
// Scale the default blocksize of datatype dt by the ratio num / den. The
// multiplication is performed before the (integer) division.
BLIS_INLINE void bli_blksz_scale_def
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t scaled = ( bli_blksz_get_def( dt, b ) * num ) / den;

	bli_blksz_set_def( scaled, dt, b );
}
// Scale the maximum blocksize of datatype dt by the ratio num / den. The
// multiplication is performed before the (integer) division.
BLIS_INLINE void bli_blksz_scale_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t scaled = ( bli_blksz_get_max( dt, b ) * num ) / den;

	bli_blksz_set_max( scaled, dt, b );
}
// Scale both the default and the maximum blocksize of datatype dt by the
// ratio num / den.
BLIS_INLINE void bli_blksz_scale_def_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	// The two scalings operate on different arrays (e and v), so their
	// order does not matter.
	bli_blksz_scale_max( num, den, dt, b );
	bli_blksz_scale_def( num, den, dt, b );
}
// -----------------------------------------------------------------------------
// Allocate and initialize a blksz_t; arguments are (default, max) pairs per
// datatype in the order s, d, c, z.
BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed
(
dim_t b_s, dim_t be_s,
dim_t b_d, dim_t be_d,
dim_t b_c, dim_t be_c,
dim_t b_z, dim_t be_z
);
// Allocate and initialize a blksz_t; the four default values come first,
// followed by the four max values.
BLIS_EXPORT_BLIS blksz_t* bli_blksz_create
(
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
);
// Initialize an existing blksz_t from (default, max) pairs per datatype.
BLIS_EXPORT_BLIS void bli_blksz_init_ed
(
blksz_t* b,
dim_t b_s, dim_t be_s,
dim_t b_d, dim_t be_d,
dim_t b_c, dim_t be_c,
dim_t b_z, dim_t be_z
);
// Initialize an existing blksz_t from four default values followed by four
// max values.
BLIS_EXPORT_BLIS void bli_blksz_init
(
blksz_t* b,
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z,
dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
);
// Initialize an existing blksz_t so that, for each datatype, the default and
// max entries both take the single value supplied.
BLIS_EXPORT_BLIS void bli_blksz_init_easy
(
blksz_t* b,
dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z
);
// Free a blksz_t.
BLIS_EXPORT_BLIS void bli_blksz_free
(
blksz_t* b
);
// -----------------------------------------------------------------------------
// The prototype below is compiled out, as is its definition.
#if 0
BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
#endif
// Round the default blocksize of datatype dt_bs in blksz down to a multiple
// of the default value of datatype dt_bm in bmult.
void bli_blksz_reduce_def_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
// Round the maximum blocksize of datatype dt_bs in blksz down to a multiple
// of the default value of datatype dt_bm in bmult.
void bli_blksz_reduce_max_to
(
num_t dt_bm, blksz_t* bmult,
num_t dt_bs, blksz_t* blksz
);
// -----------------------------------------------------------------------------
// Select the forward (BLIS_FWD) or backward variant based on direct and
// return its result.
dim_t bli_determine_blocksize
(
dir_t direct,
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
// Forward variant: queries the default and max blocksizes for obj's execution
// datatype from cntx and calls bli_determine_blocksize_f_sub().
dim_t bli_determine_blocksize_f
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
// Backward variant: queries the default and max blocksizes for obj's
// execution datatype from cntx and calls bli_determine_blocksize_b_sub().
dim_t bli_determine_blocksize_b
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);
// Forward blocksize rule: returns dim - i if that is at most b_max, and b_alg
// otherwise.
dim_t bli_determine_blocksize_f_sub
(
dim_t i,
dim_t dim,
dim_t b_alg,
dim_t b_max
);
// Backward blocksize rule, which takes the edge case (dim - i) % b_alg into
// account.
dim_t bli_determine_blocksize_b_sub
(
dim_t i,
dim_t dim,
dim_t b_alg,
dim_t b_max
);
cython-blis-1.0.0/blis/_src/frame/base/bli_check.c 0000664 0000000 0000000 00000053212 14634250137 0021705 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- General stuff ------------------------------------------------------------
// Act on an error code. BLIS_SUCCESS is returned immediately. For any other
// value, a message is printed along with the given file and line, and
// bli_abort() is called. Codes strictly between BLIS_ERROR_CODE_MAX and
// BLIS_ERROR_CODE_MIN are reported with their own message; all others are
// reported with the message for BLIS_UNDEFINED_ERROR_CODE.
err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line )
{
	// Success needs no reporting.
	if ( code == BLIS_SUCCESS ) return code;

	// Decide which code's message to print: the code itself when it lies
	// within the recognized range, or the "undefined" code otherwise.
	const bool   is_known = ( BLIS_ERROR_CODE_MAX < code &&
	                          code < BLIS_ERROR_CODE_MIN );
	const gint_t reported = ( is_known ? code : BLIS_UNDEFINED_ERROR_CODE );

	bli_print_msg( bli_error_string_for_code( reported ), file, line );
	bli_abort();

	return code;
}
// Return BLIS_SUCCESS if level is BLIS_NO_ERROR_CHECKING or
// BLIS_FULL_ERROR_CHECKING, and BLIS_INVALID_ERROR_CHECKING_LEVEL otherwise.
err_t bli_check_valid_error_level( errlev_t level )
{
	if ( level == BLIS_NO_ERROR_CHECKING ||
	     level == BLIS_FULL_ERROR_CHECKING )
		return BLIS_SUCCESS;

	return BLIS_INVALID_ERROR_CHECKING_LEVEL;
}
// Return BLIS_NULL_POINTER if ptr is NULL, and BLIS_SUCCESS otherwise.
err_t bli_check_null_pointer( void* ptr )
{
	return ( ptr == NULL ) ? BLIS_NULL_POINTER : BLIS_SUCCESS;
}
// -- Parameter-related checks -------------------------------------------------
// Return BLIS_SUCCESS if side is BLIS_LEFT or BLIS_RIGHT, and
// BLIS_INVALID_SIDE otherwise. (BLIS_TOP and BLIS_BOTTOM are not accepted.)
err_t bli_check_valid_side( side_t side )
{
	if ( side == BLIS_LEFT ||
	     side == BLIS_RIGHT )
		return BLIS_SUCCESS;

	return BLIS_INVALID_SIDE;
}
// Return BLIS_SUCCESS if uplo satisfies bli_is_lower() or bli_is_upper(), and
// BLIS_INVALID_UPLO otherwise.
err_t bli_check_valid_uplo( uplo_t uplo )
{
	if ( bli_is_lower( uplo ) ||
	     bli_is_upper( uplo ) )
		return BLIS_SUCCESS;

	return BLIS_INVALID_UPLO;
}
// Return BLIS_SUCCESS if trans is one of the four recognized values
// (BLIS_NO_TRANSPOSE, BLIS_TRANSPOSE, BLIS_CONJ_NO_TRANSPOSE,
// BLIS_CONJ_TRANSPOSE), and BLIS_INVALID_TRANS otherwise.
err_t bli_check_valid_trans( trans_t trans )
{
	if ( trans == BLIS_NO_TRANSPOSE      ||
	     trans == BLIS_TRANSPOSE         ||
	     trans == BLIS_CONJ_NO_TRANSPOSE ||
	     trans == BLIS_CONJ_TRANSPOSE )
		return BLIS_SUCCESS;

	return BLIS_INVALID_TRANS;
}
// Return BLIS_SUCCESS if diag is BLIS_NONUNIT_DIAG or BLIS_UNIT_DIAG, and
// BLIS_INVALID_DIAG otherwise.
err_t bli_check_valid_diag( diag_t diag )
{
	if ( diag == BLIS_NONUNIT_DIAG ||
	     diag == BLIS_UNIT_DIAG )
		return BLIS_SUCCESS;

	return BLIS_INVALID_DIAG;
}
// Return BLIS_SUCCESS if bli_obj_has_nonunit_diag() holds for the object, and
// BLIS_EXPECTED_NONUNIT_DIAG otherwise.
err_t bli_check_nonunit_diag( obj_t* a )
{
	return bli_obj_has_nonunit_diag( a ) ? BLIS_SUCCESS
	                                     : BLIS_EXPECTED_NONUNIT_DIAG;
}
// -- Datatype-related checks --------------------------------------------------
// Returns BLIS_SUCCESS when dt is one of the six datatypes recognized here
// (float, double, scomplex, dcomplex, int, constant); otherwise returns
// BLIS_INVALID_DATATYPE.
err_t bli_check_valid_datatype( num_t dt )
{
    if ( dt == BLIS_FLOAT    ||
         dt == BLIS_DOUBLE   ||
         dt == BLIS_SCOMPLEX ||
         dt == BLIS_DCOMPLEX ||
         dt == BLIS_INT      ||
         dt == BLIS_CONSTANT )
        return BLIS_SUCCESS;

    return BLIS_INVALID_DATATYPE;
}
// Object-level wrapper: applies bli_check_valid_datatype() to the datatype
// reported by bli_obj_dt( a ).
err_t bli_check_object_valid_datatype( obj_t* a )
{
    return bli_check_valid_datatype( bli_obj_dt( a ) );
}
// Returns BLIS_EXPECTED_NONINTEGER_DATATYPE when dt is BLIS_INT, and
// BLIS_SUCCESS for every other value.
err_t bli_check_noninteger_datatype( num_t dt )
{
    return ( dt == BLIS_INT ? BLIS_EXPECTED_NONINTEGER_DATATYPE
                            : BLIS_SUCCESS );
}
// Object-level wrapper: applies bli_check_noninteger_datatype() to the
// datatype reported by bli_obj_dt( a ).
err_t bli_check_noninteger_object( obj_t* a )
{
    return bli_check_noninteger_datatype( bli_obj_dt( a ) );
}
// Returns BLIS_EXPECTED_NONCONSTANT_DATATYPE when dt is BLIS_CONSTANT, and
// BLIS_SUCCESS for every other value.
err_t bli_check_nonconstant_datatype( num_t dt )
{
    return ( dt == BLIS_CONSTANT ? BLIS_EXPECTED_NONCONSTANT_DATATYPE
                                 : BLIS_SUCCESS );
}
// Object-level wrapper: applies bli_check_nonconstant_datatype() to the
// datatype reported by bli_obj_dt( a ).
err_t bli_check_nonconstant_object( obj_t* a )
{
    return bli_check_nonconstant_datatype( bli_obj_dt( a ) );
}
// Returns BLIS_SUCCESS when dt is float, double, scomplex, or dcomplex;
// otherwise returns BLIS_EXPECTED_FLOATING_POINT_DATATYPE.
err_t bli_check_floating_datatype( num_t dt )
{
    if ( dt == BLIS_FLOAT    ||
         dt == BLIS_DOUBLE   ||
         dt == BLIS_SCOMPLEX ||
         dt == BLIS_DCOMPLEX )
        return BLIS_SUCCESS;

    return BLIS_EXPECTED_FLOATING_POINT_DATATYPE;
}
// Object-level wrapper: applies bli_check_floating_datatype() to the datatype
// reported by bli_obj_dt( a ).
err_t bli_check_floating_object( obj_t* a )
{
    return bli_check_floating_datatype( bli_obj_dt( a ) );
}
// Returns BLIS_SUCCESS when dt is BLIS_FLOAT or BLIS_DOUBLE; otherwise returns
// BLIS_EXPECTED_REAL_DATATYPE.
err_t bli_check_real_datatype( num_t dt )
{
    if ( dt == BLIS_FLOAT || dt == BLIS_DOUBLE )
        return BLIS_SUCCESS;

    return BLIS_EXPECTED_REAL_DATATYPE;
}
// Object-level wrapper: applies bli_check_real_datatype() to the datatype
// reported by bli_obj_dt( a ).
err_t bli_check_real_object( obj_t* a )
{
    return bli_check_real_datatype( bli_obj_dt( a ) );
}
// Returns BLIS_SUCCESS only when dt is BLIS_INT; otherwise returns
// BLIS_EXPECTED_INTEGER_DATATYPE.
err_t bli_check_integer_datatype( num_t dt )
{
    return ( dt == BLIS_INT ? BLIS_SUCCESS
                            : BLIS_EXPECTED_INTEGER_DATATYPE );
}
// Object-level wrapper: applies bli_check_integer_datatype() to the datatype
// reported by bli_obj_dt( a ).
err_t bli_check_integer_object( obj_t* a )
{
    return bli_check_integer_datatype( bli_obj_dt( a ) );
}
// Two datatypes are considered consistent when they are equal, or when at
// least one of them is BLIS_CONSTANT. Returns BLIS_INCONSISTENT_DATATYPES
// otherwise.
err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b )
{
    // BLIS_CONSTANT on either side is compatible with anything.
    if ( dt_a == BLIS_CONSTANT || dt_b == BLIS_CONSTANT )
        return BLIS_SUCCESS;

    return ( dt_a == dt_b ? BLIS_SUCCESS : BLIS_INCONSISTENT_DATATYPES );
}
// Object-level wrapper: applies bli_check_consistent_datatypes() to the
// datatypes reported by bli_obj_dt() for a and b.
err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b )
{
    return bli_check_consistent_datatypes( bli_obj_dt( a ), bli_obj_dt( b ) );
}
// Checks that dt_r is an acceptable real counterpart of dt_c:
//   - BLIS_CONSTANT pairs with any dt_r for which bli_is_complex() is false;
//   - BLIS_FLOAT and BLIS_SCOMPLEX require dt_r == BLIS_FLOAT;
//   - BLIS_DOUBLE and BLIS_DCOMPLEX require dt_r == BLIS_DOUBLE.
// Any other dt_c passes. Failures return BLIS_EXPECTED_REAL_PROJ_OF.
err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r )
{
    if ( dt_c == BLIS_CONSTANT )
        return ( bli_is_complex( dt_r ) ? BLIS_EXPECTED_REAL_PROJ_OF
                                        : BLIS_SUCCESS );

    // Single-precision domain.
    if ( dt_c == BLIS_FLOAT || dt_c == BLIS_SCOMPLEX )
        return ( dt_r == BLIS_FLOAT ? BLIS_SUCCESS
                                    : BLIS_EXPECTED_REAL_PROJ_OF );

    // Double-precision domain.
    if ( dt_c == BLIS_DOUBLE || dt_c == BLIS_DCOMPLEX )
        return ( dt_r == BLIS_DOUBLE ? BLIS_SUCCESS
                                     : BLIS_EXPECTED_REAL_PROJ_OF );

    return BLIS_SUCCESS;
}
// Object-level wrapper: applies bli_check_datatype_real_proj_of() to the
// datatypes reported by bli_obj_dt() for c and r (in that order).
err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r )
{
    return bli_check_datatype_real_proj_of( bli_obj_dt( c ), bli_obj_dt( r ) );
}
// Reads the object's value with bli_getsc() as a (real, imaginary) pair of
// doubles and returns BLIS_EXPECTED_REAL_VALUED_OBJECT unless the imaginary
// part compares equal to zero.
err_t bli_check_real_valued_object( obj_t* a )
{
    double re;
    double im;

    bli_getsc( a, &re, &im );

    return ( im == 0.0 ? BLIS_SUCCESS : BLIS_EXPECTED_REAL_VALUED_OBJECT );
}
// Checks that dt_b has the same precision as dt_a when dt_a is a real type:
//   - dt_a == BLIS_FLOAT  requires dt_b to be BLIS_FLOAT or BLIS_SCOMPLEX;
//   - dt_a == BLIS_DOUBLE requires dt_b to be BLIS_DOUBLE or BLIS_DCOMPLEX.
// Violations return BLIS_INCONSISTENT_PRECISIONS.
// NOTE(review): every other dt_a value (including the complex types) is
// accepted unconditionally -- confirm that this is intended.
err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b )
{
    if ( dt_a == BLIS_FLOAT )
    {
        if ( dt_b == BLIS_FLOAT || dt_b == BLIS_SCOMPLEX )
            return BLIS_SUCCESS;

        return BLIS_INCONSISTENT_PRECISIONS;
    }

    if ( dt_a == BLIS_DOUBLE )
    {
        if ( dt_b == BLIS_DOUBLE || dt_b == BLIS_DCOMPLEX )
            return BLIS_SUCCESS;

        return BLIS_INCONSISTENT_PRECISIONS;
    }

    return BLIS_SUCCESS;
}
// Object-level wrapper: applies bli_check_consistent_precisions() to the
// datatypes reported by bli_obj_dt() for a and b (in that order).
err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b )
{
    return bli_check_consistent_precisions( bli_obj_dt( a ), bli_obj_dt( b ) );
}
// -- Dimension-related checks -------------------------------------------------
// Compares the after-transposition length and width of a against those of b
// and returns BLIS_NONCONFORMAL_DIMENSIONS if either pair differs.
err_t bli_check_conformal_dims( obj_t* a, obj_t* b )
{
    if ( bli_obj_length_after_trans( a ) != bli_obj_length_after_trans( b ) )
        return BLIS_NONCONFORMAL_DIMENSIONS;

    if ( bli_obj_width_after_trans( a ) != bli_obj_width_after_trans( b ) )
        return BLIS_NONCONFORMAL_DIMENSIONS;

    return BLIS_SUCCESS;
}
// Checks the three dimension constraints among a, b, and c, all measured
// after transposition:
//   length( c ) == length( a ), width( c ) == width( b ),
//   width( a )  == length( b ).
// Returns BLIS_NONCONFORMAL_DIMENSIONS if any of them fails.
err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c )
{
    // m dimension: rows of c versus rows of a.
    if ( bli_obj_length_after_trans( c ) != bli_obj_length_after_trans( a ) )
        return BLIS_NONCONFORMAL_DIMENSIONS;

    // n dimension: columns of c versus columns of b.
    if ( bli_obj_width_after_trans( c ) != bli_obj_width_after_trans( b ) )
        return BLIS_NONCONFORMAL_DIMENSIONS;

    // k dimension: columns of a versus rows of b.
    if ( bli_obj_width_after_trans( a ) != bli_obj_length_after_trans( b ) )
        return BLIS_NONCONFORMAL_DIMENSIONS;

    return BLIS_SUCCESS;
}
// Returns BLIS_NEGATIVE_DIMENSION if either dimension of the object is
// negative, BLIS_EXPECTED_SCALAR_OBJECT if the object is not 1-by-1, and
// BLIS_SUCCESS otherwise.
err_t bli_check_scalar_object( obj_t* a )
{
    const int has_negative_dim = ( bli_obj_length( a ) < 0 ||
                                   bli_obj_width( a )  < 0 );

    if ( has_negative_dim ) return BLIS_NEGATIVE_DIMENSION;

    const int is_one_by_one = ( bli_obj_length( a ) == 1 &&
                                bli_obj_width( a )  == 1 );

    return ( is_one_by_one ? BLIS_SUCCESS : BLIS_EXPECTED_SCALAR_OBJECT );
}
// Returns BLIS_NEGATIVE_DIMENSION if either dimension of the object is
// negative, BLIS_EXPECTED_VECTOR_OBJECT if bli_obj_is_vector() does not hold,
// and BLIS_SUCCESS otherwise.
err_t bli_check_vector_object( obj_t* a )
{
    const int has_negative_dim = ( bli_obj_length( a ) < 0 ||
                                   bli_obj_width( a )  < 0 );

    if ( has_negative_dim ) return BLIS_NEGATIVE_DIMENSION;

    return ( bli_obj_is_vector( a ) ? BLIS_SUCCESS
                                    : BLIS_EXPECTED_VECTOR_OBJECT );
}
// The only requirement checked here is that neither dimension is negative;
// a violation returns BLIS_NEGATIVE_DIMENSION.
err_t bli_check_matrix_object( obj_t* a )
{
    if ( bli_obj_length( a ) < 0 ) return BLIS_NEGATIVE_DIMENSION;
    if ( bli_obj_width( a )  < 0 ) return BLIS_NEGATIVE_DIMENSION;

    return BLIS_SUCCESS;
}
// Returns BLIS_UNEQUAL_VECTOR_LENGTHS unless bli_obj_vector_dim() reports the
// same value for x and y.
err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y )
{
    return ( bli_obj_vector_dim( x ) == bli_obj_vector_dim( y )
             ? BLIS_SUCCESS
             : BLIS_UNEQUAL_VECTOR_LENGTHS );
}
// Returns BLIS_EXPECTED_SQUARE_OBJECT unless the object's length equals its
// width.
err_t bli_check_square_object( obj_t* a )
{
    return ( bli_obj_length( a ) == bli_obj_width( a )
             ? BLIS_SUCCESS
             : BLIS_EXPECTED_SQUARE_OBJECT );
}
// Returns BLIS_UNEXPECTED_OBJECT_LENGTH unless the object's length equals m.
err_t bli_check_object_length_equals( obj_t* a, dim_t m )
{
    return ( bli_obj_length( a ) == m ? BLIS_SUCCESS
                                      : BLIS_UNEXPECTED_OBJECT_LENGTH );
}
// Returns BLIS_UNEXPECTED_OBJECT_WIDTH unless the object's width equals n.
err_t bli_check_object_width_equals( obj_t* a, dim_t n )
{
    return ( bli_obj_width( a ) == n ? BLIS_SUCCESS
                                     : BLIS_UNEXPECTED_OBJECT_WIDTH );
}
// Returns BLIS_UNEXPECTED_VECTOR_DIM unless bli_obj_vector_dim( a ) equals n.
err_t bli_check_vector_dim_equals( obj_t* a, dim_t n )
{
    return ( bli_obj_vector_dim( a ) == n ? BLIS_SUCCESS
                                          : BLIS_UNEXPECTED_VECTOR_DIM );
}
// Returns BLIS_UNEXPECTED_DIAG_OFFSET unless the object's diagonal offset
// equals the given offset.
err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset )
{
    return ( bli_obj_diag_offset( a ) == offset ? BLIS_SUCCESS
                                                : BLIS_UNEXPECTED_DIAG_OFFSET );
}
// -- Stride-related checks ----------------------------------------------------
// Validates the combination of matrix dimensions (m, n) and strides (rs, cs,
// is). Returns BLIS_SUCCESS when the combination is acceptable, and otherwise
// one of BLIS_NEGATIVE_DIMENSION, BLIS_INVALID_DIM_STRIDE_COMBINATION,
// BLIS_INVALID_COL_STRIDE, or BLIS_INVALID_ROW_STRIDE. Only the magnitudes of
// the strides are examined; empty matrices (m == 0 or n == 0) always pass.
err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is )
{
err_t e_val = BLIS_SUCCESS;
// Note: A lot of thought went into designing these checks. Do NOT change
// them unless you absolutely know what you are doing! Particularly, do
// not try to merge the general and row-/column-major sections. It might
// be possible, but it would be a lot less readable.
// Prohibit negative dimensions.
if ( m < 0 || n < 0 )
return BLIS_NEGATIVE_DIMENSION;
// Overwrite rs, cs, and is with the absolute value of each. We can do this
// since the checks below are not dependent on the sign of the strides.
rs = bli_abs( rs );
cs = bli_abs( cs );
is = bli_abs( is );
// The default case (whereby we interpret rs == cs == 0 as a request for
// column-major order) is handled prior to calling this function, so the
// only time we should see zero strides here is if the matrix is empty.
if ( m == 0 || n == 0 ) return e_val;
// Disallow row, column, or imaginary strides of zero.
if ( ( rs == 0 || cs == 0 || is == 0 ) )
return BLIS_INVALID_DIM_STRIDE_COMBINATION;
// Check stride consistency in cases of general stride (neither stride is
// unit).
if ( rs != 1 && cs != 1 )
{
// We apply different tests depending on which way the strides
// "tilt".
if ( rs == cs )
{
// If rs == cs, then we must be dealing with an m-by-1 or a
// 1-by-n matrix and thus at least one of the dimensions, m
// or n, must be unit. (Empty matrices were already accepted above.)
if ( m != 1 && n != 1 )
return BLIS_INVALID_DIM_STRIDE_COMBINATION;
}
else if ( rs < cs )
{
// For column-major tilt, cs must be equal or larger than m * rs.
if ( m * rs > cs )
return BLIS_INVALID_DIM_STRIDE_COMBINATION;
}
else if ( cs < rs )
{
// For row-major tilt, rs must be equal or larger than n * cs.
if ( n * cs > rs )
return BLIS_INVALID_DIM_STRIDE_COMBINATION;
}
}
else // check stride consistency of row-/column-storage cases.
{
if ( rs == 1 && cs == 1 )
{
// If rs == cs == 1, then we must be dealing with an m-by-1, a
// 1-by-n, or a 1-by-1 matrix and thus at least one of the
// dimensions, m or n, must be unit. (Empty matrices were already
// accepted above.)
if ( m != 1 && n != 1 )
return BLIS_INVALID_DIM_STRIDE_COMBINATION;
}
else if ( rs == 1 )
{
// For column-major storage, don't allow the column stride to be
// less than the m dimension.
if ( cs < m )
return BLIS_INVALID_COL_STRIDE;
}
else if ( cs == 1 )
{
// For row-major storage, don't allow the row stride to be less
// than the n dimension.
if ( rs < n )
return BLIS_INVALID_ROW_STRIDE;
}
}
// Every check passed.
return e_val;
}
// -- Structure-related checks -------------------------------------------------
// Returns BLIS_EXPECTED_GENERAL_OBJECT unless bli_obj_is_general() holds.
err_t bli_check_general_object( obj_t* a )
{
    return ( bli_obj_is_general( a ) ? BLIS_SUCCESS
                                     : BLIS_EXPECTED_GENERAL_OBJECT );
}
// Returns BLIS_EXPECTED_HERMITIAN_OBJECT unless bli_obj_is_hermitian() holds.
err_t bli_check_hermitian_object( obj_t* a )
{
    return ( bli_obj_is_hermitian( a ) ? BLIS_SUCCESS
                                       : BLIS_EXPECTED_HERMITIAN_OBJECT );
}
// Returns BLIS_EXPECTED_SYMMETRIC_OBJECT unless bli_obj_is_symmetric() holds.
err_t bli_check_symmetric_object( obj_t* a )
{
    return ( bli_obj_is_symmetric( a ) ? BLIS_SUCCESS
                                       : BLIS_EXPECTED_SYMMETRIC_OBJECT );
}
// Returns BLIS_EXPECTED_TRIANGULAR_OBJECT unless bli_obj_is_triangular()
// holds.
err_t bli_check_triangular_object( obj_t* a )
{
    return ( bli_obj_is_triangular( a ) ? BLIS_SUCCESS
                                        : BLIS_EXPECTED_TRIANGULAR_OBJECT );
}
// Dispatches to the structure check that matches struc (general, hermitian,
// symmetric, or triangular, tested in that order) and returns its result.
// A struc value matching none of the four yields BLIS_SUCCESS.
err_t bli_check_object_struc( obj_t* a, struc_t struc )
{
    if ( bli_is_general( struc ) )    return bli_check_general_object( a );
    if ( bli_is_hermitian( struc ) )  return bli_check_hermitian_object( a );
    if ( bli_is_symmetric( struc ) )  return bli_check_symmetric_object( a );
    if ( bli_is_triangular( struc ) ) return bli_check_triangular_object( a );

    return BLIS_SUCCESS;
}
// -- Storage-related checks ---------------------------------------------------
// Returns BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT unless the object satisfies
// bli_obj_is_lower() or bli_obj_is_upper().
err_t bli_check_upper_or_lower_object( obj_t* a )
{
    if ( bli_obj_is_lower( a ) || bli_obj_is_upper( a ) )
        return BLIS_SUCCESS;

    return BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT;
}
// -- Partitioning-related checks ----------------------------------------------
// Returns BLIS_SUCCESS when part is one of the seven subpartition identifiers
// accepted for a 3x1 partitioning; otherwise returns BLIS_INVALID_3x1_SUBPART.
err_t bli_check_valid_3x1_subpart( subpart_t part )
{
    if ( part == BLIS_SUBPART0     ||
         part == BLIS_SUBPART1AND0 ||
         part == BLIS_SUBPART1     ||
         part == BLIS_SUBPART1AND2 ||
         part == BLIS_SUBPART2     ||
         part == BLIS_SUBPART1A    ||
         part == BLIS_SUBPART1B )
        return BLIS_SUCCESS;

    return BLIS_INVALID_3x1_SUBPART;
}
// Returns BLIS_SUCCESS when part is one of the seven subpartition identifiers
// accepted for a 1x3 partitioning; otherwise returns BLIS_INVALID_1x3_SUBPART.
err_t bli_check_valid_1x3_subpart( subpart_t part )
{
    if ( part == BLIS_SUBPART0     ||
         part == BLIS_SUBPART1AND0 ||
         part == BLIS_SUBPART1     ||
         part == BLIS_SUBPART1AND2 ||
         part == BLIS_SUBPART2     ||
         part == BLIS_SUBPART1A    ||
         part == BLIS_SUBPART1B )
        return BLIS_SUCCESS;

    return BLIS_INVALID_1x3_SUBPART;
}
// Returns BLIS_SUCCESS when part is one of the nine BLIS_SUBPARTij
// identifiers (i, j in 0..2); otherwise returns BLIS_INVALID_3x3_SUBPART.
err_t bli_check_valid_3x3_subpart( subpart_t part )
{
    if ( part == BLIS_SUBPART00 ||
         part == BLIS_SUBPART10 ||
         part == BLIS_SUBPART20 ||
         part == BLIS_SUBPART01 ||
         part == BLIS_SUBPART11 ||
         part == BLIS_SUBPART21 ||
         part == BLIS_SUBPART02 ||
         part == BLIS_SUBPART12 ||
         part == BLIS_SUBPART22 )
        return BLIS_SUCCESS;

    return BLIS_INVALID_3x3_SUBPART;
}
// -- Control tree-related checks ----------------------------------------------
// Returns BLIS_UNEXPECTED_NULL_CONTROL_TREE when cntl is NULL and
// BLIS_SUCCESS otherwise.
err_t bli_check_valid_cntl( void* cntl )
{
    return ( cntl != NULL ? BLIS_SUCCESS : BLIS_UNEXPECTED_NULL_CONTROL_TREE );
}
// -- Packing-related checks ---------------------------------------------------
// Returns BLIS_SUCCESS when the object's pack schema is one of the four
// schemas accepted here (packed rows, columns, row panels, or column panels);
// otherwise returns BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK.
err_t bli_check_packm_schema_on_unpack( obj_t* a )
{
    if ( bli_obj_pack_schema( a ) == BLIS_PACKED_ROWS       ||
         bli_obj_pack_schema( a ) == BLIS_PACKED_COLUMNS    ||
         bli_obj_pack_schema( a ) == BLIS_PACKED_ROW_PANELS ||
         bli_obj_pack_schema( a ) == BLIS_PACKED_COL_PANELS )
        return BLIS_SUCCESS;

    return BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK;
}
// Returns BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK unless the object's pack
// schema is BLIS_PACKED_VECTOR.
err_t bli_check_packv_schema_on_unpack( obj_t* a )
{
    return ( bli_obj_pack_schema( a ) == BLIS_PACKED_VECTOR
             ? BLIS_SUCCESS
             : BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK );
}
// -- Buffer-related checks ----------------------------------------------------
// Returns BLIS_EXPECTED_NONNULL_OBJECT_BUFFER when the object's buffer is NULL
// while BOTH of its dimensions are positive. A NULL buffer is tolerated for
// objects with a non-positive dimension.
err_t bli_check_object_buffer( obj_t* a )
{
    if ( bli_obj_buffer( a ) == NULL &&
         bli_obj_length( a ) > 0     &&
         bli_obj_width( a )  > 0 )
        return BLIS_EXPECTED_NONNULL_OBJECT_BUFFER;

    return BLIS_SUCCESS;
}
// -- Memory checks ------------------------------------------------------------
// Returns BLIS_MALLOC_RETURNED_NULL when ptr is NULL and BLIS_SUCCESS
// otherwise.
err_t bli_check_valid_malloc_buf( void* ptr )
{
    return ( ptr != NULL ? BLIS_SUCCESS : BLIS_MALLOC_RETURNED_NULL );
}
// -- Internal memory pool checks ----------------------------------------------
// Returns BLIS_SUCCESS when buf_type is one of the four pack buffer types
// accepted here (A block, B panel, C panel, general use); otherwise returns
// BLIS_INVALID_PACKBUF.
err_t bli_check_valid_packbuf( packbuf_t buf_type )
{
    if ( buf_type == BLIS_BUFFER_FOR_A_BLOCK ||
         buf_type == BLIS_BUFFER_FOR_B_PANEL ||
         buf_type == BLIS_BUFFER_FOR_C_PANEL ||
         buf_type == BLIS_BUFFER_FOR_GEN_USE )
        return BLIS_SUCCESS;

    return BLIS_INVALID_PACKBUF;
}
// Returns BLIS_EXHAUSTED_CONTIG_MEMORY_POOL when bli_pool_is_exhausted()
// holds for the pool, and BLIS_SUCCESS otherwise.
err_t bli_check_if_exhausted_pool( pool_t* pool )
{
    return ( bli_pool_is_exhausted( pool ) ? BLIS_EXHAUSTED_CONTIG_MEMORY_POOL
                                           : BLIS_SUCCESS );
}
// For every datatype from BLIS_DT_LO through BLIS_DT_HI, queries the default
// MR and NR blocksizes from the context and verifies that an MR-by-NR tile of
// that datatype fits within BLIS_STACK_BUF_MAX_SIZE bytes. Returns
// BLIS_INSUFFICIENT_STACK_BUF_SIZE if any datatype fails the test.
err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx )
{
    err_t result = BLIS_SUCCESS;

    for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
    {
        dim_t tile_m    = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
        dim_t tile_n    = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );
        siz_t elem_size = bli_dt_size( dt );

        // NOTE: For induced methods, the size of the complex datatypes is
        // used (rather than the size of the native micro-kernels' datatype)
        // because the macro-kernel needs this larger micro-tile footprint,
        // even if the virtual micro-kernel implementation will only ever be
        // writing to half of it (real or imaginary part) at a time.
        if ( tile_m * tile_n * elem_size > BLIS_STACK_BUF_MAX_SIZE )
            result = BLIS_INSUFFICIENT_STACK_BUF_SIZE;
    }

    return result;
}
// Returns BLIS_ALIGNMENT_NOT_POWER_OF_TWO if align_size is zero or is not a
// power of two, and BLIS_SUCCESS otherwise.
err_t bli_check_alignment_is_power_of_two( size_t align_size )
{
    // A power of two has exactly one bit set, so clearing its lowest set bit
    // (x & (x - 1)) must leave zero. Zero itself is rejected explicitly.
    if ( align_size == 0 || ( align_size & ( align_size - 1 ) ) != 0 )
        return BLIS_ALIGNMENT_NOT_POWER_OF_TWO;

    return BLIS_SUCCESS;
}
// Returns BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE unless align_size is a whole
// multiple of sizeof( void* ).
err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size )
{
    return ( align_size % sizeof( void* ) == 0
             ? BLIS_SUCCESS
             : BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE );
}
// -- Object-related errors ----------------------------------------------------
// Returns BLIS_EXPECTED_OBJECT_ALIAS unless bli_obj_is_alias_of( a, b ) holds.
err_t bli_check_object_alias_of( obj_t* a, obj_t* b )
{
    return ( bli_obj_is_alias_of( a, b ) ? BLIS_SUCCESS
                                         : BLIS_EXPECTED_OBJECT_ALIAS );
}
// -- Architecture-related errors ----------------------------------------------
// Returns BLIS_SUCCESS when id, viewed as a signed integer, lies in the range
// [0, BLIS_NUM_ARCHS); otherwise returns BLIS_INVALID_ARCH_ID.
err_t bli_check_valid_arch_id( arch_t id )
{
    const gint_t idx = ( gint_t )id;

    if ( 0 <= idx && idx < BLIS_NUM_ARCHS )
        return BLIS_SUCCESS;

    return BLIS_INVALID_ARCH_ID;
}
// Returns BLIS_UNINITIALIZED_GKS_CNTX when the cntx pointer itself is NULL
// and BLIS_SUCCESS otherwise. The pointed-to entry is not inspected.
err_t bli_check_initialized_gks_cntx( cntx_t** cntx )
{
    return ( cntx != NULL ? BLIS_SUCCESS : BLIS_UNINITIALIZED_GKS_CNTX );
}
// -- Blocksize-related errors -------------------------------------------------
// For every datatype from BLIS_DT_LO through BLIS_DT_HI, verifies that both
// the default and the maximum MC blocksize are whole multiples of the default
// MR blocksize. Returns BLIS_MC_DEF_NONMULTIPLE_OF_MR or
// BLIS_MC_MAX_NONMULTIPLE_OF_MR at the first violation (default checked
// first), and BLIS_SUCCESS if none is found.
// NOTE(review): a zero MR value would make the modulo undefined -- presumably
// callers never pass one; confirm.
err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr )
{
    for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
    {
        dim_t mc_def  = bli_blksz_get_def( dt, mc );
        dim_t mc_max  = bli_blksz_get_max( dt, mc );
        dim_t divisor = bli_blksz_get_def( dt, mr );

        if ( mc_def % divisor != 0 ) return BLIS_MC_DEF_NONMULTIPLE_OF_MR;
        if ( mc_max % divisor != 0 ) return BLIS_MC_MAX_NONMULTIPLE_OF_MR;
    }

    return BLIS_SUCCESS;
}
// For every datatype from BLIS_DT_LO through BLIS_DT_HI, verifies that both
// the default and the maximum NC blocksize are whole multiples of the default
// NR blocksize. Returns BLIS_NC_DEF_NONMULTIPLE_OF_NR or
// BLIS_NC_MAX_NONMULTIPLE_OF_NR at the first violation (default checked
// first), and BLIS_SUCCESS if none is found.
// NOTE(review): a zero NR value would make the modulo undefined -- presumably
// callers never pass one; confirm.
err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr )
{
    for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
    {
        dim_t nc_def  = bli_blksz_get_def( dt, nc );
        dim_t nc_max  = bli_blksz_get_max( dt, nc );
        dim_t divisor = bli_blksz_get_def( dt, nr );

        if ( nc_def % divisor != 0 ) return BLIS_NC_DEF_NONMULTIPLE_OF_NR;
        if ( nc_max % divisor != 0 ) return BLIS_NC_MAX_NONMULTIPLE_OF_NR;
    }

    return BLIS_SUCCESS;
}
// For every datatype from BLIS_DT_LO through BLIS_DT_HI, verifies that both
// the default and the maximum KC blocksize are whole multiples of the default
// KR blocksize. Returns BLIS_KC_DEF_NONMULTIPLE_OF_KR or
// BLIS_KC_MAX_NONMULTIPLE_OF_KR at the first violation (default checked
// first), and BLIS_SUCCESS if none is found.
// NOTE(review): a zero KR value would make the modulo undefined -- presumably
// callers never pass one; confirm.
err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr )
{
    for ( num_t dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
    {
        dim_t kc_def  = bli_blksz_get_def( dt, kc );
        dim_t kc_max  = bli_blksz_get_max( dt, kc );
        dim_t divisor = bli_blksz_get_def( dt, kr );

        if ( kc_def % divisor != 0 ) return BLIS_KC_DEF_NONMULTIPLE_OF_KR;
        if ( kc_max % divisor != 0 ) return BLIS_KC_MAX_NONMULTIPLE_OF_KR;
    }

    return BLIS_SUCCESS;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_check.h 0000664 0000000 0000000 00000012030 14634250137 0021703 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the bli_check_*() validation functions. Each returns an
// err_t: BLIS_SUCCESS when the check passes, or a specific error code when it
// does not. Only bli_check_error_code_helper() carries the BLIS_EXPORT_BLIS
// annotation.

// -- General stuff --
BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line );
err_t bli_check_valid_error_level( errlev_t level );
err_t bli_check_null_pointer( void* ptr );
// -- Parameter-related checks --
err_t bli_check_valid_side( side_t side );
err_t bli_check_valid_uplo( uplo_t uplo );
err_t bli_check_valid_trans( trans_t trans );
err_t bli_check_valid_diag( diag_t diag );
err_t bli_check_nonunit_diag( obj_t* a );
// -- Datatype-related checks --
err_t bli_check_valid_datatype( num_t dt );
err_t bli_check_object_valid_datatype( obj_t* a );
err_t bli_check_noninteger_datatype( num_t dt );
err_t bli_check_noninteger_object( obj_t* a );
err_t bli_check_nonconstant_datatype( num_t dt );
err_t bli_check_nonconstant_object( obj_t* a );
err_t bli_check_floating_datatype( num_t dt );
err_t bli_check_floating_object( obj_t* a );
err_t bli_check_real_datatype( num_t dt );
err_t bli_check_real_object( obj_t* a );
err_t bli_check_integer_datatype( num_t dt );
err_t bli_check_integer_object( obj_t* a );
err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b );
err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b );
err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r );
err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r );
err_t bli_check_real_valued_object( obj_t* a );
err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b );
err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b );
// -- Dimension-related checks --
err_t bli_check_conformal_dims( obj_t* a, obj_t* b );
err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c );
err_t bli_check_scalar_object( obj_t* a );
err_t bli_check_vector_object( obj_t* a );
err_t bli_check_matrix_object( obj_t* a );
err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y );
err_t bli_check_square_object( obj_t* a );
err_t bli_check_object_length_equals( obj_t* a, dim_t m );
err_t bli_check_object_width_equals( obj_t* a, dim_t n );
err_t bli_check_vector_dim_equals( obj_t* a, dim_t n );
err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset );
// -- Stride-related checks --
err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is );
// -- Structure-related checks --
err_t bli_check_general_object( obj_t* a );
err_t bli_check_hermitian_object( obj_t* a );
err_t bli_check_symmetric_object( obj_t* a );
err_t bli_check_triangular_object( obj_t* a );
err_t bli_check_object_struc( obj_t* a, struc_t struc );
// -- Storage-related checks --
err_t bli_check_upper_or_lower_object( obj_t* a );
// -- Partitioning-related checks --
err_t bli_check_valid_3x1_subpart( subpart_t part );
err_t bli_check_valid_1x3_subpart( subpart_t part );
err_t bli_check_valid_3x3_subpart( subpart_t part );
// -- Control tree-related checks --
err_t bli_check_valid_cntl( void* cntl );
// -- Packing-related checks --
err_t bli_check_packm_schema_on_unpack( obj_t* a );
err_t bli_check_packv_schema_on_unpack( obj_t* a );
// -- Buffer-related checks --
err_t bli_check_object_buffer( obj_t* a );
// -- Memory checks --
err_t bli_check_valid_malloc_buf( void* ptr );
// -- Internal memory pool checks --
err_t bli_check_valid_packbuf( packbuf_t buf_type );
err_t bli_check_if_exhausted_pool( pool_t* pool );
err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
err_t bli_check_alignment_is_power_of_two( size_t align_size );
err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
// -- Object-related errors --
err_t bli_check_object_alias_of( obj_t* a, obj_t* b );
// -- Architecture-related errors --
err_t bli_check_valid_arch_id( arch_t id );
err_t bli_check_initialized_gks_cntx( cntx_t** cntx );
// -- Blocksize multiple checks (MC/MR, NC/NR, KC/KR) --
err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr );
err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr );
err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr );
cython-blis-1.0.0/blis/_src/frame/base/bli_clock.c 0000664 0000000 0000000 00000011012 14634250137 0021713 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Reference timestamp, in seconds, used by the OSX and Linux variants of
// bli_clock_helper(): while it still equals 0.0 it is set from the current
// clock reading, and it is then subtracted from readings before they are
// returned. The Windows and systemless variants do not use it.
static double gtod_ref_time_sec = 0.0;
// Returns the current clock reading, in seconds, as produced by the
// build-specific bli_clock_helper().
double bli_clock( void )
{
    const double now = bli_clock_helper();

    return now;
}
// Computes the time elapsed since time_start (using bli_clock()) and returns
// the smaller of that and time_min. If the resulting minimum is not strictly
// positive, or is under one nanosecond, it is assumed to be garbled (the
// clock samples were taken too closely together) and the incoming time_min
// is returned unchanged instead.
double bli_clock_min_diff( double time_min, double time_start )
{
    const double elapsed   = bli_clock() - time_start;
    const double candidate = bli_fmin( time_min, elapsed );

    // Reject implausible minimums and keep the previous value.
    if ( candidate <= 0.0 || candidate < 1.0e-9 )
        return time_min;

    return candidate;
}
#ifdef BLIS_DISABLE_SYSTEM
// --- Begin systemless definitions --------------------------------------------
// Systemless build (BLIS_DISABLE_SYSTEM): no clock is available, so the
// helper always reports 0.0.
// The parameter list is spelled ( void ) so that this definition is a proper
// prototype and matches the declaration in bli_clock.h.
double bli_clock_helper( void )
{
    return 0.0;
}
// --- End systemless definitions ----------------------------------------------
#else
// --- Begin system definitions ------------------------------------------------
#if BLIS_OS_WINDOWS
// --- Begin Windows build definitions -----------------------------------------
// Windows build: returns the performance-counter reading converted to
// seconds (counter value divided by counter frequency). If either query
// fails, a message is printed and bli_abort() is called.
// The parameter list is spelled ( void ) so that this definition is a proper
// prototype and matches the declaration in bli_clock.h.
double bli_clock_helper( void )
{
    LARGE_INTEGER clock_freq = {0};
    LARGE_INTEGER clock_val;
    BOOL          r_val;

    // Ticks per second of the performance counter.
    r_val = QueryPerformanceFrequency( &clock_freq );

    if ( r_val == 0 )
    {
        bli_print_msg( "QueryPerformanceFrequency() failed", __FILE__, __LINE__ );
        bli_abort();
    }

    // Current tick count.
    r_val = QueryPerformanceCounter( &clock_val );

    if ( r_val == 0 )
    {
        bli_print_msg( "QueryPerformanceCounter() failed", __FILE__, __LINE__ );
        bli_abort();
    }

    return ( ( double) clock_val.QuadPart / ( double) clock_freq.QuadPart );
}
// --- End Windows build definitions -------------------------------------------
#elif BLIS_OS_OSX
// --- Begin OSX build definitions -------------------------------------------
// OSX build: converts mach_absolute_time() to seconds using the timebase
// numerator/denominator, and returns the result relative to the reference
// time stored in gtod_ref_time_sec (which is set from the current reading
// while it still equals 0.0).
// The parameter list is spelled ( void ) so that this definition is a proper
// prototype and matches the declaration in bli_clock.h.
double bli_clock_helper( void )
{
    mach_timebase_info_data_t timebase;
    mach_timebase_info( &timebase );

    uint64_t nsec = mach_absolute_time();

    // Scale raw ticks by numer/denom, then by 1e-9 to obtain seconds.
    double the_time = (double) nsec * 1.0e-9 * timebase.numer / timebase.denom;

    if ( gtod_ref_time_sec == 0.0 )
        gtod_ref_time_sec = the_time;

    return the_time - gtod_ref_time_sec;
}
// --- End OSX build definitions ---------------------------------------------
#else
// --- Begin Linux build definitions -------------------------------------------
// Default (Linux) build: reads CLOCK_MONOTONIC and returns the time in
// seconds. The whole-second part is taken relative to gtod_ref_time_sec
// (which is set from the current tv_sec while it still equals 0.0); the
// nanosecond part is then added.
// The parameter list is spelled ( void ) so that this definition is a proper
// prototype and matches the declaration in bli_clock.h.
double bli_clock_helper( void )
{
    double          the_time, norm_sec;
    struct timespec ts;

    clock_gettime( CLOCK_MONOTONIC, &ts );

    if ( gtod_ref_time_sec == 0.0 )
        gtod_ref_time_sec = ( double ) ts.tv_sec;

    // Subtracting the reference keeps the seconds value small before the
    // fractional part is added.
    norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec;

    the_time = norm_sec + ts.tv_nsec * 1.0e-9;

    return the_time;
}
// --- End Linux build definitions ---------------------------------------------
#endif
// --- End system definitions --------------------------------------------------
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_clock.h 0000664 0000000 0000000 00000003451 14634250137 0021730 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Exported timing interface: bli_clock() returns a time reading in seconds;
// bli_clock_min_diff() returns the smaller of time_min and the time elapsed
// since time_start.
BLIS_EXPORT_BLIS double bli_clock( void );
BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start );
// Build-specific clock implementation used by bli_clock(); declared without
// the BLIS_EXPORT_BLIS annotation.
double bli_clock_helper( void );
cython-blis-1.0.0/blis/_src/frame/base/bli_cntl.c 0000664 0000000 0000000 00000025600 14634250137 0021570 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Allocates a cntl_t node via bli_sba_acquire() and initializes it with the
// given family, blocksize id, variant function, params, and sub-node. The
// sub-prenode is set to NULL and the node's pack mem_t entry is cleared.
// Returns the new node.
cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     )
{
#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntl_create_node(): " );
#endif

    // Obtain storage for the node itself.
    cntl_t* node = bli_sba_acquire( rntm, sizeof( cntl_t ) );

    // Fill in the caller-provided fields.
    bli_cntl_set_family( family, node );
    bli_cntl_set_bszid( bszid, node );
    bli_cntl_set_var_func( var_func, node );
    bli_cntl_set_params( params, node );

    // Link the children: no prenode yet, and the given sub-node.
    bli_cntl_set_sub_prenode( NULL, node );
    bli_cntl_set_sub_node( sub_node, node );

    // Clear the node's packed mem_t entry.
    // NOTE: This initialization is important, since it allows threads to
    // discern whether blocks have been acquired from the memory allocator.
    bli_mem_clear( bli_cntl_pack_mem( node ) );

    return node;
}
// Releases a single cntl_t node back through bli_sba_release(). Child nodes
// and any memory referenced by the node are not touched here.
void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     )
{
#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntl_free_node(): " );
#endif

    // Hand the node's storage back.
    bli_sba_release( rntm, cntl );
}
// Resets a cntl_t node: the variant function, params, sub-prenode, and
// sub-node are set to NULL and the pack mem_t entry is cleared. The family
// and blocksize id fields are left as they are.
void bli_cntl_clear_node
     (
       cntl_t* cntl
     )
{
    // Clearing the pack mem_t entry is potentially the more important part
    // if the control tree is cached somewhere and reused.
    bli_mem_clear( bli_cntl_pack_mem( cntl ) );

    // Clearing the remaining fields is not actually needed, but is done for
    // debugging/completeness.
    bli_cntl_set_sub_node( NULL, cntl );
    bli_cntl_set_sub_prenode( NULL, cntl );
    bli_cntl_set_params( NULL, cntl );
    bli_cntl_set_var_func( NULL, cntl );
}
// -----------------------------------------------------------------------------
// Free an entire control tree. Dispatches to the thrinfo_t-aware variant
// when a thrinfo_t is supplied, and to the thrinfo_t-free variant when
// thread is NULL.
void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // No thread info available: free the tree unconditionally.
    if ( thread == NULL )
    {
        bli_cntl_free_wo_thrinfo( rntm, cntl );
        return;
    }

    // Otherwise walk the cntl_t and thrinfo_t trees together.
    bli_cntl_free_w_thrinfo( rntm, cntl, thread );
}
// Recursively free a control tree while walking the matching thrinfo_t tree.
//
// For each node: the sub-prenode branch is freed first, then the sub-node
// branch, then the node's params struct (if any), then the node's pack mem_t
// block (only when a thrinfo_t exists for this level, the calling thread is
// the chief of its group, and the block is allocated), and finally the node
// itself. A NULL cntl is a no-op.
void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
    // Base case of the recursion.
    if ( cntl == NULL ) return;

    // Snapshot everything we need from the node before anything is freed.
    cntl_t* child_pre  = bli_cntl_sub_prenode( cntl );
    cntl_t* child_sub  = bli_cntl_sub_node( cntl );
    void*   params     = bli_cntl_params( cntl );
    mem_t*  pack_mem   = bli_cntl_pack_mem( cntl );

    // The thrinfo_t tree is not always built out as far as the cntl_t tree
    // (e.g. more ways of parallelism than micropanels in a dimension, or a
    // problem small enough that there is no gemm subproblem in
    // bli_trsm_blk_var1()). So only dereference the thrinfo_t when it
    // exists; otherwise its children are treated as NULL.
    thrinfo_t* thread_pre = NULL;
    thrinfo_t* thread_sub = NULL;

    if ( thread != NULL )
    {
        thread_pre = bli_thrinfo_sub_prenode( thread );
        thread_sub = bli_thrinfo_sub_node( thread );
    }

    // Free the prenode branch (and everything beneath it), if present.
    if ( child_pre != NULL )
        bli_cntl_free_w_thrinfo( rntm, child_pre, thread_pre );

    // Free the sub-node branch (and everything beneath it), if present.
    if ( child_sub != NULL )
        bli_cntl_free_w_thrinfo( rntm, child_sub, thread_sub );

    // Release this node's params struct, if it has one.
    if ( params != NULL )
    {
        #ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_cntl_free_w_thrinfo(): " );
        #endif
        bli_sba_release( rntm, params );
    }

    // Return the pack mem_t block to the broker it came from, but only if
    // (1) a thrinfo_t exists at this level, (2) this thread is chief of its
    // group, and (3) the mem_t entry is actually allocated. The tests are
    // evaluated left to right and stop at the first failure, so the
    // thrinfo_t is never dereferenced when it is NULL.
    if ( thread != NULL &&
         bli_thread_am_ochief( thread ) &&
         bli_mem_is_alloc( pack_mem ) )
    {
        #ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" );
        #endif
        bli_pba_release( rntm, pack_mem );
    }

    // Finally, release the node struct itself.
    bli_cntl_free_node( rntm, cntl );
}
// Recursively free a control tree when no thrinfo_t tree is available.
//
// For each node: the sub-prenode branch is freed first, then the sub-node
// branch, then the params struct (if non-NULL), then the pack mem_t block
// (if allocated), and finally the node itself. A NULL cntl is a no-op,
// which also terminates the recursion at missing children.
void bli_cntl_free_wo_thrinfo
     (
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    // Base case of the recursion.
    if ( cntl == NULL ) return;

    // Read the params pointer and pack mem_t address up front; they are
    // used after the child branches have been freed.
    void*  params   = bli_cntl_params( cntl );
    mem_t* pack_mem = bli_cntl_pack_mem( cntl );

    // Free both child branches. NULL children are handled by the base case.
    bli_cntl_free_wo_thrinfo( rntm, bli_cntl_sub_prenode( cntl ) );
    bli_cntl_free_wo_thrinfo( rntm, bli_cntl_sub_node( cntl ) );

    // Release this node's params struct, if it has one.
    if ( params != NULL )
        bli_sba_release( rntm, params );

    // Return the pack mem_t block to the broker it came from, if one is
    // currently allocated.
    if ( bli_mem_is_alloc( pack_mem ) )
        bli_pba_release( rntm, pack_mem );

    // Finally, release the node struct itself.
    bli_cntl_free_node( rntm, cntl );
}
// -----------------------------------------------------------------------------
// Deep-copy a control tree: the given node, its params struct (if any), and,
// recursively, its sub-prenode and sub-node branches. Returns the address of
// the new root node.
//
// The source node is expected NOT to hold any allocated/cached mem_t entry:
// the copy is built with bli_cntl_create_node(), which always produces a
// node whose pack mem_t entry is cleared.
cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    // Start with a node that mirrors the scalar fields of the source but
    // has no params and no children yet.
    cntl_t* dup = bli_cntl_create_node
    (
      rntm,
      bli_cntl_family( cntl ),
      bli_cntl_bszid( cntl ),
      bli_cntl_var_func( cntl ),
      NULL, NULL
    );

    // Duplicate the params struct, if the source has one.
    void* src_params = bli_cntl_params( cntl );

    if ( src_params != NULL )
    {
        // The params struct carries its own size in its leading uint64_t
        // field; use that to size the new allocation and the copy.
        uint64_t n_bytes    = bli_cntl_params_size( cntl );
        void*    dst_params = bli_sba_acquire( rntm, ( size_t )n_bytes );

        memcpy( dst_params, src_params, n_bytes );

        bli_cntl_set_params( dst_params, dup );
    }

    // Recursively duplicate the prenode branch, if present, and attach it
    // to the new node.
    cntl_t* src_prenode = bli_cntl_sub_prenode( cntl );

    if ( src_prenode != NULL )
    {
        cntl_t* dup_prenode = bli_cntl_copy( rntm, src_prenode );

        bli_cntl_set_sub_prenode( dup_prenode, dup );
    }

    // Recursively duplicate the sub-node branch, if present, and attach it
    // to the new node.
    cntl_t* src_sub_node = bli_cntl_sub_node( cntl );

    if ( src_sub_node != NULL )
    {
        cntl_t* dup_sub_node = bli_cntl_copy( rntm, src_sub_node );

        bli_cntl_set_sub_node( dup_sub_node, dup );
    }

    return dup;
}
// Set the family field of cntl and of every node reachable from it through
// sub-prenode and sub-node links. It is used by bli_l3_cntl_create_if()
// after copying a user-given cntl tree, so that the blocked variants can
// determine the appropriate behavior for the operation family.
void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     )
{
    // Mark the root of this (sub-)tree.
    bli_cntl_set_family( family, cntl );

    cntl_t* prenode  = bli_cntl_sub_prenode( cntl );
    cntl_t* sub_node = bli_cntl_sub_node( cntl );

    // Recurse into the sub-tree rooted at the prenode, if it exists.
    if ( prenode != NULL )
        bli_cntl_mark_family( family, prenode );

    // Recurse into the sub-tree rooted at the sub-node, if it exists.
    if ( sub_node != NULL )
        bli_cntl_mark_family( family, sub_node );
}
// -----------------------------------------------------------------------------
// Compute the product of the ways of parallelism, as reported by
// bli_rntm_ways_for(), over the chain of nodes starting at cntl and
// following sub-node links. Nodes whose bszid is BLIS_NO_PART contribute a
// factor of 1. Prenodes are not visited. Returns 1 when cntl is NULL.
dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     )
{
    dim_t   n_threads = 1;
    cntl_t* node      = cntl;

    while ( node != NULL )
    {
        const bszid_t bszid = bli_cntl_bszid( node );

        // bszid is assumed to be one of {NC,KC,MC,NR,MR,KR} whenever it is
        // not BLIS_NO_PART. Non-partitioning nodes leave the product
        // unchanged.
        if ( bszid != BLIS_NO_PART )
            n_threads *= bli_rntm_ways_for( bszid, rntm );

        node = bli_cntl_sub_node( node );
    }

    return n_threads;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_cntl.h 0000664 0000000 0000000 00000012522 14634250137 0021574 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
// -- Control tree node definition --
struct cntl_s
{
// Basic fields (usually required).
opid_t family;
bszid_t bszid;
void_fp var_func;
struct cntl_s* sub_prenode;
struct cntl_s* sub_node;
// Optional fields (needed only by some operations such as packm).
// NOTE: first field of params must be a uint64_t containing the size
// of the struct.
void* params;
// Internal fields that track "cached" data.
mem_t pack_mem;
};
typedef struct cntl_s cntl_t;
*/
// -- Control tree prototypes --
// Allocate and initialize a single control tree node with the given field
// values; returns the new node.
BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
(
rntm_t* rntm,
opid_t family,
bszid_t bszid,
void_fp var_func,
void* params,
cntl_t* sub_node
);
// Release the storage of a single node (the cntl_t struct only).
BLIS_EXPORT_BLIS void bli_cntl_free_node
(
rntm_t* rntm,
cntl_t* cntl
);
// Reset a node's pointer fields to NULL and clear its pack mem_t entry,
// without freeing anything.
BLIS_EXPORT_BLIS void bli_cntl_clear_node
(
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// Free an entire control tree; dispatches to the _w_thrinfo variant when
// thread is non-NULL and to the _wo_thrinfo variant otherwise.
BLIS_EXPORT_BLIS void bli_cntl_free
(
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Recursively free a control tree while walking the corresponding
// thrinfo_t tree.
BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
(
rntm_t* rntm,
cntl_t* cntl,
thrinfo_t* thread
);
// Recursively free a control tree without consulting any thrinfo_t.
BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
(
rntm_t* rntm,
cntl_t* cntl
);
// Recursively copy a control tree (nodes and params structs); returns the
// root of the copy.
BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
(
rntm_t* rntm,
cntl_t* cntl
);
// Set the family field of cntl and of all nodes beneath it.
BLIS_EXPORT_BLIS void bli_cntl_mark_family
(
opid_t family,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// Multiply together the ways of parallelism of the nodes along the sub-node
// chain starting at cntl.
// NOTE(review): unlike the prototypes above, this one is not marked
// BLIS_EXPORT_BLIS -- confirm whether that is intentional.
dim_t bli_cntl_calc_num_threads_in
(
rntm_t* rntm,
cntl_t* cntl
);
// -----------------------------------------------------------------------------
// cntl_t query (fields only)
// Return the operation family id stored in the node.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
return cntl->family;
}
// Return the blocksize id stored in the node.
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
return cntl->bszid;
}
// Return the variant function pointer stored in the node.
BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
return cntl->var_func;
}
// Return the node's sub-prenode pointer (may be NULL).
BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
return cntl->sub_prenode;
}
// Return the node's sub-node pointer (may be NULL).
BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
return cntl->sub_node;
}
// Return the node's params pointer (may be NULL).
BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
return cntl->params;
}
// Return the size of the node's params struct, as recorded in the struct
// itself. The params pointer is dereferenced unconditionally, so it must be
// non-NULL when this is called.
BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
// The first 64 bits (one uint64_t) of the params structure always hold
// the size of that structure.
return *( ( uint64_t* )(cntl->params) );
}
// Return the address of the mem_t entry embedded in the node (not a copy).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
return &(cntl->pack_mem);
}
// cntl_t query (complex)
// Return true if the control tree pointer itself is NULL.
BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
    return cntl == NULL;
}

// Return true if the node has no sub-node. (The sub-prenode is not
// consulted.)
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
    cntl_t* child = bli_cntl_sub_node( cntl );

    return child == NULL;
}

// Return true if the node's blocksize id is anything other than
// BLIS_NO_PART.
BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
    bszid_t bszid = bli_cntl_bszid( cntl );

    return bszid != BLIS_NO_PART;
}
// cntl_t modification
// Store the operation family id in the node.
BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
cntl->family = family;
}
// Store the blocksize id in the node.
BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
cntl->bszid = bszid;
}
// Store the variant function pointer in the node.
BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
cntl->var_func = var_func;
}
// Store the sub-prenode pointer in the node (pointer only; nothing is
// copied or freed).
BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
cntl->sub_prenode = sub_prenode;
}
// Store the sub-node pointer in the node (pointer only; nothing is copied
// or freed).
BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
cntl->sub_node = sub_node;
}
// Store the params pointer in the node (pointer only; the params struct is
// not copied).
BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
cntl->params = params;
}
// Copy the contents of *pack_mem into the node's embedded mem_t entry. This
// is a struct copy by value, so pack_mem must be non-NULL.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
cntl->pack_mem = *pack_mem;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_cntx.c 0000664 0000000 0000000 00000144707 14634250137 0021616 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Zero out every byte of the given context structure.
void bli_cntx_clear( cntx_t* cntx )
{
    memset( cntx, 0, sizeof( *cntx ) );
}
// -----------------------------------------------------------------------------
// Install a set of blocksizes (and their blocksize-multiple ids) into a
// context.
//
// This function can be called from the bli_cntx_init_*() function for a
// particular architecture if the kernel developer wishes to use non-default
// blocksizes. It should be called after bli_cntx_init_defaults() so that
// the context begins with default blocksizes across all datatypes.
//
// The variadic arguments are n_bs tuples followed by the cntx_t* to modify.
// The tuple layout depends on method:
//
//   method == BLIS_NAT:
//     bszid_t bs_id, blksz_t* blksz, bszid_t bm_id
//
//   method != BLIS_NAT:
//     bszid_t bs_id, blksz_t* blksz, bszid_t bm_id,
//     double def_scalr, double max_scalr
//
// NOTE: The scalars are fetched with va_arg( args, double ), so callers must
// pass them as double values (e.g. 1.0, 2.0). Passing integer arguments in
// these positions would not match the type that is read back.
void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
    va_list args;
    dim_t   i;
    err_t   r_val;

    // Because the context pointer is the LAST variadic argument, the tuples
    // must be buffered in temporary arrays until it has been read.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

    // -- Begin variable argument section --

    va_start( args, n_bs );

    for ( i = 0; i < n_bs; ++i )
    {
        // Every tuple begins with the blocksize id, the address of the
        // source blksz_t object, and the id of the associated blocksize
        // multiple.
        bszids[ i ] = va_arg( args, bszid_t );
        blkszs[ i ] = va_arg( args, blksz_t* );
        bmults[ i ] = va_arg( args, bszid_t );

        // Induced-method tuples additionally carry the scalars applied to
        // the real blocksizes to obtain the complex default and maximum
        // blocksizes.
        if ( method != BLIS_NAT )
        {
            dsclrs[ i ] = va_arg( args, double );
            msclrs[ i ] = va_arg( args, double );
        }
    }

    // The argument following the tuples is the context pointer.
    cntx_t* cntx = va_arg( args, cntx_t* );

    va_end( args );

    // -- End variable argument section --

    // Record the execution method in the context.
    bli_cntx_set_method( method, cntx );

    // Locate the context's blocksize and blocksize-multiple arrays.
    blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx );
    bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx );

    // Transfer the buffered tuples into the context. Only pointers to the
    // blksz_t objects were buffered, but it is the objects' contents that
    // get copied into the context.
    for ( i = 0; i < n_bs; ++i )
    {
        const bszid_t bs_id = bszids[ i ];
        blksz_t*      src   = blkszs[ i ];
        blksz_t*      dst   = &cntx_blkszs[ bs_id ];

        if ( method == BLIS_NAT )
        {
            // Native execution: copy the blocksize values using
            // bli_blksz_copy_if_pos().
            bli_blksz_copy_if_pos( src, dst );
        }
        else
        {
            const double dsclr = dsclrs[ i ];
            const double msclr = msclrs[ i ];

            // Copy the real domain values of the source object into the
            // context, duplicating them into the complex domain fields.
            bli_blksz_copy_dt( BLIS_FLOAT,  src, BLIS_FLOAT,    dst );
            bli_blksz_copy_dt( BLIS_DOUBLE, src, BLIS_DOUBLE,   dst );
            bli_blksz_copy_dt( BLIS_FLOAT,  src, BLIS_SCOMPLEX, dst );
            bli_blksz_copy_dt( BLIS_DOUBLE, src, BLIS_DCOMPLEX, dst );

            // A non-unit default scalar means the complex domain default
            // blocksizes must be scaled.
            if ( dsclr != 1.0 )
            {
                bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, dst );
                bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, dst );
            }

            // Likewise for the complex domain maximum blocksizes.
            if ( msclr != 1.0 )
            {
                bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, dst );
                bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, dst );
            }
        }

        // In both cases, record the blocksize multiple id.
        cntx_bmults[ bs_id ] = bmults[ i ];
    }

    // Release the temporary arrays.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bli_free_intl( blkszs );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bli_free_intl( bszids );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bli_free_intl( bmults );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bli_free_intl( dsclrs );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_blkszs(): " );
    #endif
    bli_free_intl( msclrs );
}
// -----------------------------------------------------------------------------
// Derive induced-method blocksizes for datatype dt inside an existing
// context.
//
// The variadic arguments are n_bs tuples followed by the cntx_t* to modify:
//
//   bszid_t bs_id, double def_scalr, double max_scalr
//
// For each tuple, the blocksize values stored for the real projection of dt
// are copied into the dt slot of the same blksz_t object, and then the
// default and maximum values for dt are scaled if the corresponding scalar
// is not 1.0.
//
// NOTE: The scalars are fetched with va_arg( args, double ), so callers must
// pass them as double values (e.g. 1.0, 2.0).
//
// NOTE: This function modifies an existing context that is presumed to have
// been initialized for native execution. It returns immediately, without
// reading any variadic argument, when method is BLIS_NAT.
void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
{
    va_list args;
    dim_t   i;
    err_t   r_val;

    // The real-domain counterpart of dt; used below as the copy source.
    const num_t dt_real = bli_dt_proj_to_real( dt );

    // Nothing to do for native execution.
    if ( method == BLIS_NAT ) return;

    // Because the context pointer is the LAST variadic argument, the tuples
    // must be buffered in temporary arrays until it has been read.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

    // -- Begin variable argument section --

    va_start( args, n_bs );

    for ( i = 0; i < n_bs; ++i )
    {
        // Each tuple holds the blocksize id followed by the scalars for
        // the default and maximum blocksizes.
        bszids[ i ] = va_arg( args, bszid_t );
        dsclrs[ i ] = va_arg( args, double );
        msclrs[ i ] = va_arg( args, double );
    }

    // The argument following the tuples is the context pointer.
    cntx_t* cntx = va_arg( args, cntx_t* );

    va_end( args );

    // -- End variable argument section --

    // Record the execution method in the context.
    bli_cntx_set_method( method, cntx );

    // Apply each buffered tuple to the context.
    for ( i = 0; i < n_bs; ++i )
    {
        const double dsclr = dsclrs[ i ];
        const double msclr = msclrs[ i ];

        // Look up the context's blksz_t object associated with the current
        // blocksize id.
        blksz_t* blksz = bli_cntx_get_blksz( bszids[ i ], cntx );

        // Seed the dt slot with the values from the real-domain slot of the
        // same object.
        bli_blksz_copy_dt( dt_real, blksz, dt, blksz );

        // Scale the default blocksize for dt if its scalar is non-unit.
        if ( dsclr != 1.0 )
            bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, blksz );

        // Scale the maximum blocksize for dt if its scalar is non-unit.
        if ( msclr != 1.0 )
            bli_blksz_scale_max( 1, ( dim_t )msclr, dt, blksz );
    }

    // Release the temporary arrays.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    bli_free_intl( bszids );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    bli_free_intl( dsclrs );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_ind_blkszs(): " );
    #endif
    bli_free_intl( msclrs );
}
// -----------------------------------------------------------------------------
// Install native level-3 microkernels, and their storage preferences, into
// a context.
//
// This function can be called from the bli_cntx_init_*() function for a
// particular architecture if the kernel developer wishes to use non-default
// level-3 microkernels. It should be called after bli_cntx_init_defaults()
// so that the context begins with default microkernels across all
// datatypes.
//
// The variadic arguments are n_ukrs tuples followed by the cntx_t* to
// modify:
//
//   l3ukr_t ukr_id, num_t dt, void_fp ukr_fp, bool pref
//
// Each microkernel address is stored in BOTH the native and the virtual
// ukernel slot of the context.
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
{
    va_list args;
    dim_t   i;
    err_t   r_val;

    // Because the context pointer is the LAST variadic argument, the tuples
    // must be buffered in temporary arrays until it has been read.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val );

    // -- Begin variable argument section --

    va_start( args, n_ukrs );

    for ( i = 0; i < n_ukrs; ++i )
    {
        // Each tuple holds the ukernel id, the ukernel datatype, the
        // ukernel function pointer, and the ukernel storage preference.
        ukr_ids[ i ] = va_arg( args, l3ukr_t );
        ukr_dts[ i ] = va_arg( args, num_t );
        ukr_fps[ i ] = va_arg( args, void_fp );

        // NOTE: The preference must be fetched as an int. TRUE/FALSE are
        // passed to a variadic function with no contextual typecast, so the
        // default argument promotions treat them as int; reading any other
        // type here may break on systems where int is 32 bits while long
        // int and pointers are 64 bits. The value is converted to bool only
        // after it has been fetched. (This caveat dates from when the
        // now-retired bool_t type was in use.)
        ukr_prefs[ i ] = ( bool )va_arg( args, int );
    }

    // The argument following the tuples is the context pointer.
    cntx_t* cntx = va_arg( args, cntx_t* );

    va_end( args );

    // -- End variable argument section --

    // Locate the context's l3 virtual ukernel func_t array, l3 native
    // ukernel func_t array, and l3 native ukernel preferences array.
    func_t*  cntx_l3_vir_ukrs       = bli_cntx_l3_vir_ukrs_buf( cntx );
    func_t*  cntx_l3_nat_ukrs       = bli_cntx_l3_nat_ukrs_buf( cntx );
    mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

    // Transfer each buffered ukernel tuple into the context.
    for ( i = 0; i < n_ukrs; ++i )
    {
        const l3ukr_t id   = ukr_ids[ i ];
        const num_t   dt   = ukr_dts[ i ];
        void_fp       fp   = ukr_fps[ i ];
        const bool    pref = ukr_prefs[ i ];

        // The native ukernel address is redundantly stored in both the
        // virtual and native slots. This is standard practice when creating
        // a native context. (Induced method contexts will overwrite the
        // virtual function pointer with the address of the appropriate
        // virtual ukernel.)
        bli_func_set_dt( fp, dt, &cntx_l3_vir_ukrs[ id ] );
        bli_func_set_dt( fp, dt, &cntx_l3_nat_ukrs[ id ] );

        // Record the storage preference for this ukernel and datatype.
        bli_mbool_set_dt( pref, dt, &cntx_l3_nat_ukrs_prefs[ id ] );
    }

    // Release the temporary arrays.

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    bli_free_intl( ukr_ids );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    bli_free_intl( ukr_dts );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    bli_free_intl( ukr_fps );

    #ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_cntx_set_l3_nat_ukrs(): " );
    #endif
    bli_free_intl( ukr_prefs );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... )
{
	// Installs caller-supplied level-3 virtual microkernels into a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has filled the context with
	// default microkernels for every datatype.
	//
	// Arguments:
	//   n_ukrs - number of ( l3ukr_t, num_t, void_fp ) tuples that follow.
	//   ...    - the n_ukrs tuples, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l3_vir_ukrs( n_ukrs,
	//                             id0, dt0, fp0,
	//                             id1, dt1, fp1,
	//                             ...,
	//                             cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer is the last variadic argument, so each tuple is
	// buffered in scratch arrays until that pointer has been read.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	l3ukr_t* ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	num_t* dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	void_fp* fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val );

	// Pass 1: drain the argument list into the scratch arrays.
	va_start( ap, n_ukrs );

	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		ids[ k ] = va_arg( ap, l3ukr_t );
		dts[ k ] = va_arg( ap, num_t );
		fps[ k ] = va_arg( ap, void_fp );
	}

	// Whatever follows the tuples is the context to modify.
	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: write each function pointer into the virtual ukernel func_t
	// selected by its id, in the slot for its datatype.
	func_t* vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx );

	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		bli_func_set_dt( fps[ k ], dts[ k ], &vir_ukrs[ ids[ k ] ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	bli_free_intl( dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_vir_ukrs(): " );
	#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... )
{
	// Overrides the small/unpacked (sup) matrix-handling thresholds stored
	// in a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// thresholds.
	//
	// Arguments:
	//   n_thresh - number of ( threshid_t, blksz_t* ) pairs that follow.
	//   ...      - the n_thresh pairs, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l3_sup_thresh( n_thresh,
	//                               th0_id, &blksz0,
	//                               th1_id, &blksz1,
	//                               ...,
	//                               cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the pairs are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_thresh(): " );
	#endif
	threshid_t* ids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_thresh(): " );
	#endif
	blksz_t** srcs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val );

	// Pass 1: collect the pairs.
	va_start( ap, n_thresh );

	for ( dim_t k = 0; k < n_thresh; ++k )
	{
		ids[ k ]  = va_arg( ap, threshid_t );
		srcs[ k ] = va_arg( ap, blksz_t* );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: only pointers were saved above; the pointed-to blksz_t
	// contents are copied into the context's threshold array here, via the
	// "_if_pos" copy routine rather than an unconditional struct copy.
	blksz_t* dsts = bli_cntx_l3_sup_thresh_buf( cntx );

	for ( dim_t k = 0; k < n_thresh; ++k )
	{
		bli_blksz_copy_if_pos( srcs[ k ], &dsts[ ids[ k ] ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_thresh(): " );
	#endif
	bli_free_intl( srcs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_thresh(): " );
	#endif
	bli_free_intl( ids );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... )
{
	// Overrides the level-3 operation handlers used for small/unpacked
	// (sup) matrices in a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// sup handlers.
	//
	// Arguments:
	//   n_ops - number of ( opid_t, void* ) pairs that follow.
	//   ...   - the n_ops pairs, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l3_sup_handlers( n_ops,
	//                                 op0_id, handler0_fp,
	//                                 op1_id, handler1_fp,
	//                                 ...,
	//                                 cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the pairs are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_handlers(): " );
	#endif
	opid_t* ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_handlers(): " );
	#endif
	void** fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val );

	// Pass 1: collect the n_ops pairs.
	va_start( ap, n_ops );

	for ( dim_t k = 0; k < n_ops; ++k )
	{
		ids[ k ] = va_arg( ap, opid_t );
		fps[ k ] = va_arg( ap, void* );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: drop each handler into the slot indexed by its operation id.
	void** handlers = bli_cntx_l3_sup_handlers_buf( cntx );

	for ( dim_t k = 0; k < n_ops; ++k )
	{
		handlers[ ids[ k ] ] = fps[ k ];
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_handlers(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_handlers(): " );
	#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... )
{
	// Overrides the level-3 sup (small/unpacked) blocksizes stored in a
	// context.
	//
	// This function can be called from the bli_cntx_init_*() function for
	// a particular architecture if the kernel developer wishes to use
	// non-default l3 sup blocksizes. It should be called after
	// bli_cntx_init_defaults() so that the context begins with default
	// blocksizes across all datatypes.
	//
	// Arguments:
	//   n_bs - number of ( bszid_t, blksz_t* ) pairs that follow.
	//   ...  - the n_bs pairs, followed by the cntx_t* to update.

	/* Example prototypes:

	   void bli_cntx_set_l3_sup_blkszs
	   (
	     dim_t   n_bs,
	     bszid_t bs0_id, blksz_t* blksz0,
	     bszid_t bs1_id, blksz_t* blksz1,
	     bszid_t bs2_id, blksz_t* blksz2,
	     ...
	     cntx_t* cntx
	   );
	*/

	va_list args;
	dim_t   i;
	err_t   r_val;

	// Allocate some temporary local arrays. They are needed because the
	// context pointer is the last variadic argument, so the pairs must be
	// held somewhere until it has been read.
	//
	// NOTE: The memory-tracing prefixes below name this function so that
	// traced allocations are attributed correctly.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_blkszs(): " );
	#endif
	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_blkszs(): " );
	#endif
	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val );

	// -- Begin variable argument section --

	// Initialize variable argument environment.
	va_start( args, n_bs );

	// Process n_bs pairs: the bszid_t of the blocksize, then the address
	// of the blksz_t object that holds the new values.
	for ( i = 0; i < n_bs; ++i )
	{
		bszids[ i ] = va_arg( args, bszid_t );
		blkszs[ i ] = va_arg( args, blksz_t* );
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = va_arg( args, cntx_t* );

	// Shutdown variable argument environment and clean up stack.
	va_end( args );

	// -- End variable argument section --

	// Query the context for the address of the sup blocksize object array.
	blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx );

	// Now that we have the context address, copy the values from the
	// temporary buffers into the context. Only the blksz_t* pointers were
	// saved above; the contents of the objects are copied here, using the
	// "_if_pos" copy routine rather than an unconditional struct copy.
	for ( i = 0; i < n_bs; ++i )
	{
		bszid_t  bs_id             = bszids[ i ];
		blksz_t* blksz             = blkszs[ i ];
		blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ];

		bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz );
	}

	// Free the temporary local arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_blkszs(): " );
	#endif
	bli_free_intl( blkszs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_blkszs(): " );
	#endif
	bli_free_intl( bszids );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... )
{
	// Installs caller-supplied level-3 micro/millikernels for small/unpacked
	// (sup) matrices, together with their storage preferences, into a
	// context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// sup kernels for every datatype.
	//
	// Arguments:
	//   n_ukrs - number of ( stor3_t, num_t, void*, bool ) tuples that
	//            follow.
	//   ...    - the n_ukrs tuples, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l3_sup_kers( n_ukrs,
	//                             stor_id0, dt0, ukr0_fp, pref0,
	//                             stor_id1, dt1, ukr1_fp, pref1,
	//                             ...,
	//                             cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the tuples are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	stor3_t* ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	num_t* dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	void** fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	bool* row_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val );

	// Pass 1: collect the tuples.
	va_start( ap, n_ukrs );

	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		ids[ k ] = va_arg( ap, stor3_t );
		dts[ k ] = va_arg( ap, num_t );
		fps[ k ] = va_arg( ap, void* );

		// The preference is fetched as an int and then narrowed to bool.
		row_prefs[ k ] = ( bool )va_arg( ap, int );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Locate the sup kernel func_t array and its matching preference array.
	func_t*  sup_kers  = bli_cntx_l3_sup_kers_buf( cntx );
	mbool_t* sup_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx );

	// Disabled: an earlier mapping from sup ukernel types to stor3_t slots,
	// retained for reference only.
#if 0
	dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2];
	// Create the small/unpacked ukernel mappings:
	// - rv -> rrr 0, rcr 2
	// - rg -> rrc 1, rcc 3
	// - cv -> ccr 6, ccc 7
	// - cg -> crr 4, crc 5
	// - rd -> rrc 1
	// - cd -> crc 5
	// - rc -> rcc 3
	// - cr -> crr 4
	// - gx -> xxx 8
	// NOTE: We only need to set one slot in the context l3_sup_kers array
	// for the general-stride/generic ukernel type, but since the loop below
	// needs to be set up to set two slots to accommodate the RV, RG, CV, and
	// CG, ukernel types, we will just be okay with the GX ukernel being set
	// redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly
	// for the same reason.)
	sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR;
	sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR;
	sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC;
	sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC;
	sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR;
	sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC;
	sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR;
	sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC;
	sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC;
	sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC;
	sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC;
	sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC;
	sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC;
	sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC;
	sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR;
	sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR;
	sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX;
	sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX;
#endif

	// Pass 2: for every tuple, record the kernel address and its preference
	// in the entries indexed by the stor3_t id, at the tuple's datatype.
	for ( dim_t k = 0; k < n_ukrs; ++k )
	{
		const stor3_t slot = ids[ k ];

		bli_func_set_dt( fps[ k ], dts[ k ], &sup_kers[ slot ] );
		bli_mbool_set_dt( row_prefs[ k ], dts[ k ], &sup_prefs[ slot ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	bli_free_intl( dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	bli_free_intl( fps );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_sup_kers(): " );
	#endif
	bli_free_intl( row_prefs );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l1f_kers( dim_t n_kers, ... )
{
	// Installs caller-supplied level-1f kernels into a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// l1f kernels for every datatype.
	//
	// Arguments:
	//   n_kers - number of ( l1fkr_t, num_t, void_fp ) tuples that follow.
	//   ...    - the n_kers tuples, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l1f_kers( n_kers,
	//                          ker0_id, ker0_dt, ker0_fp,
	//                          ker1_id, ker1_dt, ker1_fp,
	//                          ...,
	//                          cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the tuples are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	l1fkr_t* ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	num_t* dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	void_fp* fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );

	// Pass 1: collect the tuples.
	va_start( ap, n_kers );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		ids[ k ] = va_arg( ap, l1fkr_t );
		dts[ k ] = va_arg( ap, num_t );
		fps[ k ] = va_arg( ap, void_fp );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: write each kernel address into the level-1f func_t selected
	// by its id, in the slot for its datatype.
	func_t* l1f_kers = bli_cntx_l1f_kers_buf( cntx );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		bli_func_set_dt( fps[ k ], dts[ k ], &l1f_kers[ ids[ k ] ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	bli_free_intl( dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1f_kers(): " );
	#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_l1v_kers( dim_t n_kers, ... )
{
	// Installs caller-supplied level-1v kernels into a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// l1v kernels for every datatype.
	//
	// Arguments:
	//   n_kers - number of ( l1vkr_t, num_t, void_fp ) tuples that follow.
	//   ...    - the n_kers tuples, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_l1v_kers( n_kers,
	//                          ker0_id, ker0_dt, ker0_fp,
	//                          ker1_id, ker1_dt, ker1_fp,
	//                          ...,
	//                          cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the tuples are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	l1vkr_t* ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	num_t* dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	void_fp* fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );

	// Pass 1: collect the tuples.
	va_start( ap, n_kers );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		ids[ k ] = va_arg( ap, l1vkr_t );
		dts[ k ] = va_arg( ap, num_t );
		fps[ k ] = va_arg( ap, void_fp );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: write each kernel address into the level-1v func_t selected
	// by its id, in the slot for its datatype.
	func_t* l1v_kers = bli_cntx_l1v_kers_buf( cntx );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		bli_func_set_dt( fps[ k ], dts[ k ], &l1v_kers[ ids[ k ] ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	bli_free_intl( dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l1v_kers(): " );
	#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_set_packm_kers( dim_t n_kers, ... )
{
	// Installs caller-supplied packing (packm) kernels into a context.
	//
	// Intended to be invoked from an architecture's bli_cntx_init_*()
	// function, after bli_cntx_init_defaults() has installed the default
	// packm kernels for every datatype.
	//
	// Arguments:
	//   n_kers - number of ( l1mkr_t, num_t, void_fp ) tuples that follow.
	//   ...    - the n_kers tuples, followed by the cntx_t* to update.
	//
	// Call shape:
	//   bli_cntx_set_packm_kers( n_kers,
	//                            ker0_id, ker0_dt, ker0_fp,
	//                            ker1_id, ker1_dt, ker1_fp,
	//                            ...,
	//                            cntx );

	err_t   r_val;
	va_list ap;

	// The context pointer arrives last, so the tuples are parked in scratch
	// arrays first.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	l1mkr_t* ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	num_t* dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	void_fp* fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val );

	// Pass 1: collect the tuples.
	va_start( ap, n_kers );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		ids[ k ] = va_arg( ap, l1mkr_t );
		dts[ k ] = va_arg( ap, num_t );
		fps[ k ] = va_arg( ap, void_fp );
	}

	cntx_t* cntx = va_arg( ap, cntx_t* );

	va_end( ap );

	// Pass 2: write each kernel address into the packm func_t selected by
	// its id, in the slot for its datatype.
	func_t* packm_kers = bli_cntx_packm_kers_buf( cntx );

	for ( dim_t k = 0; k < n_kers; ++k )
	{
		bli_func_set_dt( fps[ k ], dts[ k ], &packm_kers[ ids[ k ] ] );
	}

	// Release the scratch arrays.
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	bli_free_intl( ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	bli_free_intl( dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_packm_kers(): " );
	#endif
	bli_free_intl( fps );
}
// -----------------------------------------------------------------------------
void bli_cntx_print( cntx_t* cntx )
{
	// Dumps a context to stdout, one row per id with one column for each of
	// the float, double, scomplex, and dcomplex datatypes. Sections, in
	// order: blocksize/multiple values, level-3 virtual ukernels, level-3
	// sup kernels, level-1f kernels, level-1v kernels, and finally the
	// context's method value.

	// Column header for the datatype columns.
	printf( " s d c z\n" );

	// Default blocksize and blocksize-multiple values.
	for ( dim_t bs = 0; bs < BLIS_NUM_BLKSZS; ++bs )
	{
		printf( "blksz/mult %2lu: %13lu/%2lu %13lu/%2lu %13lu/%2lu %13lu/%2lu\n",
		        ( unsigned long )bs,
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_FLOAT,    bs, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt    ( BLIS_FLOAT,    bs, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_DOUBLE,   bs, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt    ( BLIS_DOUBLE,   bs, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, bs, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt    ( BLIS_SCOMPLEX, bs, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, bs, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt    ( BLIS_DCOMPLEX, bs, cntx ) );
	}

	// Level-3 virtual microkernel addresses.
	for ( dim_t u = 0; u < BLIS_NUM_LEVEL3_UKRS; ++u )
	{
		func_t* vir = bli_cntx_get_l3_vir_ukrs( u, cntx );

		printf( "l3 vir ukr %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )u,
		        bli_func_get_dt( BLIS_FLOAT,    vir ),
		        bli_func_get_dt( BLIS_DOUBLE,   vir ),
		        bli_func_get_dt( BLIS_SCOMPLEX, vir ),
		        bli_func_get_dt( BLIS_DCOMPLEX, vir ) );
	}

	// Level-3 sup kernel addresses, one row per storage combination.
	for ( dim_t s = 0; s < BLIS_NUM_3OP_RC_COMBOS; ++s )
	{
		func_t* sup = bli_cntx_get_l3_sup_kers( s, cntx );

		printf( "l3 sup ukr %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )s,
		        bli_func_get_dt( BLIS_FLOAT,    sup ),
		        bli_func_get_dt( BLIS_DOUBLE,   sup ),
		        bli_func_get_dt( BLIS_SCOMPLEX, sup ),
		        bli_func_get_dt( BLIS_DCOMPLEX, sup ) );
	}

	// Level-1f kernel addresses.
	for ( dim_t f = 0; f < BLIS_NUM_LEVEL1F_KERS; ++f )
	{
		func_t* l1f = bli_cntx_get_l1f_kers( f, cntx );

		printf( "l1f ker %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )f,
		        bli_func_get_dt( BLIS_FLOAT,    l1f ),
		        bli_func_get_dt( BLIS_DOUBLE,   l1f ),
		        bli_func_get_dt( BLIS_SCOMPLEX, l1f ),
		        bli_func_get_dt( BLIS_DCOMPLEX, l1f ) );
	}

	// Level-1v kernel addresses.
	for ( dim_t v = 0; v < BLIS_NUM_LEVEL1V_KERS; ++v )
	{
		func_t* l1v = bli_cntx_get_l1v_kers( v, cntx );

		printf( "l1v ker %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )v,
		        bli_func_get_dt( BLIS_FLOAT,    l1v ),
		        bli_func_get_dt( BLIS_DOUBLE,   l1v ),
		        bli_func_get_dt( BLIS_SCOMPLEX, l1v ),
		        bli_func_get_dt( BLIS_DCOMPLEX, l1v ) );
	}

	// The method (ind_t) recorded in the context, printed as an integer.
	printf( "ind method : %lu\n", ( unsigned long )bli_cntx_method( cntx ) );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_cntx.h 0000664 0000000 0000000 00000052275 14634250137 0021621 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H
// Context object type (defined in bli_type_defs.h)
/*
typedef struct cntx_s
{
blksz_t* blkszs;
bszid_t* bmults;
func_t* l3_vir_ukrs;
func_t* l3_nat_ukrs;
mbool_t* l3_nat_ukrs_prefs;
blksz_t* l3_sup_thresh;
void** l3_sup_handlers;
blksz_t* l3_sup_blkszs;
func_t* l3_sup_kers;
mbool_t* l3_sup_kers_prefs;
func_t* l1f_kers;
func_t* l1v_kers;
func_t* packm_kers;
func_t* unpackm_kers;
ind_t method;
} cntx_t;
*/
// -----------------------------------------------------------------------------
//
// -- cntx_t query (fields only) -----------------------------------------------
//
BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	// Expose the context's blksz_t array (the blkszs field).
	blksz_t* buf = cntx->blkszs;

	return buf;
}
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	// Expose the context's array of blocksize-multiple ids (the bmults
	// field).
	bszid_t* buf = cntx->bmults;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	// Expose the context's level-3 virtual ukernel func_t array (the
	// l3_vir_ukrs field).
	func_t* buf = cntx->l3_vir_ukrs;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	// Expose the context's level-3 native ukernel func_t array (the
	// l3_nat_ukrs field).
	func_t* buf = cntx->l3_nat_ukrs;

	return buf;
}
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	// Expose the context's level-3 native ukernel preference array (the
	// l3_nat_ukrs_prefs field).
	mbool_t* buf = cntx->l3_nat_ukrs_prefs;

	return buf;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	// Expose the context's sup threshold array (the l3_sup_thresh field).
	blksz_t* buf = cntx->l3_sup_thresh;

	return buf;
}
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	// Expose the context's sup handler pointer array (the l3_sup_handlers
	// field).
	void** buf = cntx->l3_sup_handlers;

	return buf;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	// Expose the context's sup blocksize array (the l3_sup_blkszs field).
	blksz_t* buf = cntx->l3_sup_blkszs;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	// Expose the context's sup kernel func_t array (the l3_sup_kers field).
	func_t* buf = cntx->l3_sup_kers;

	return buf;
}
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	// Expose the context's sup kernel preference array (the
	// l3_sup_kers_prefs field).
	mbool_t* buf = cntx->l3_sup_kers_prefs;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	// Expose the context's level-1f kernel func_t array (the l1f_kers
	// field).
	func_t* buf = cntx->l1f_kers;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	// Expose the context's level-1v kernel func_t array (the l1v_kers
	// field).
	func_t* buf = cntx->l1v_kers;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	// Expose the context's packm kernel func_t array (the packm_kers
	// field).
	func_t* buf = cntx->packm_kers;

	return buf;
}
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	// Expose the context's unpackm kernel func_t array (the unpackm_kers
	// field).
	func_t* buf = cntx->unpackm_kers;

	return buf;
}
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	// Report the ind_t value held in the context's method field.
	const ind_t method = cntx->method;

	return method;
}
// -----------------------------------------------------------------------------
//
// -- cntx_t modification (fields only) ----------------------------------------
//
// Field mutator: overwrite the context's method field with the given value.
BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
{
    ( *cntx ).method = method;
}
// -----------------------------------------------------------------------------
//
// -- cntx_t query (complex) ---------------------------------------------------
//
// Look up the blksz_t entry for bs_id in the context's blocksize table and
// return its address (not a copy).
BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx )
{
    blksz_t* table = bli_cntx_blkszs_buf( cntx );

    return table + bs_id;
}
// Return the default (main) blocksize value stored for datatype dt in the
// blksz_t identified by bs_id.
BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
    return bli_blksz_get_def( dt, bli_cntx_get_blksz( bs_id, cntx ) );
}
// Return the maximum (auxiliary) blocksize value stored for datatype dt in
// the blksz_t identified by bs_id.
BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
    return bli_blksz_get_max( dt, bli_cntx_get_blksz( bs_id, cntx ) );
}
// Return the blocksize id recorded in the bmults table for bs_id, i.e. the
// id of the blocksize that serves as bs_id's multiple.
BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx )
{
    return bli_cntx_bmults_buf( cntx )[ bs_id ];
}
// Return the address of the blksz_t that acts as the blocksize multiple for
// bs_id: first map bs_id to its multiple's id, then look that id up.
BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx )
{
    const bszid_t mult_id = bli_cntx_get_bmult_id( bs_id, cntx );

    return bli_cntx_get_blksz( mult_id, cntx );
}
// Return the default value, for datatype dt, of the blocksize multiple
// associated with bs_id.
BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
    return bli_blksz_get_def( dt, bli_cntx_get_bmult( bs_id, cntx ) );
}
// -----------------------------------------------------------------------------
// Return the address of the func_t stored for ukr_id in the context's
// l3_vir_ukrs table.
BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
{
    func_t* table = bli_cntx_l3_vir_ukrs_buf( cntx );

    return table + ukr_id;
}
// Return the datatype-specific function pointer for ukr_id from the
// l3_vir_ukrs table.
BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    func_t* entry = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx );
    void_fp fp    = bli_func_get_dt( dt, entry );

    return fp;
}
// Return the address of the func_t stored for ukr_id in the context's
// l3_nat_ukrs table.
BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
{
    func_t* table = bli_cntx_l3_nat_ukrs_buf( cntx );

    return table + ukr_id;
}
// Return the datatype-specific function pointer for ukr_id from the
// l3_nat_ukrs table.
BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    func_t* entry = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx );
    void_fp fp    = bli_func_get_dt( dt, entry );

    return fp;
}
// -----------------------------------------------------------------------------
// Return the address of the mbool_t preference entry stored for ukr_id in
// the context's l3_nat_ukrs_prefs table.
BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx )
{
    mbool_t* table = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

    return table + ukr_id;
}
// Return, as a bool, the preference value stored for datatype dt in the
// l3_nat_ukrs_prefs entry for ukr_id.
BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    mbool_t*   entry = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );
    const bool pref  = ( bool )bli_mbool_get_dt( dt, entry );

    return pref;
}
// -----------------------------------------------------------------------------
// Return the address of the blksz_t stored for thresh_id in the context's
// l3_sup_thresh table.
BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx )
{
    blksz_t* table = bli_cntx_l3_sup_thresh_buf( cntx );

    return table + thresh_id;
}
// Return the default (main) threshold value stored for datatype dt in the
// l3_sup_thresh entry identified by thresh_id.
BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx )
{
    return bli_blksz_get_def( dt, bli_cntx_get_l3_sup_thresh( thresh_id, cntx ) );
}
// Return TRUE if at least one of m, n, k is strictly smaller than its
// corresponding threshold (BLIS_MT, BLIS_NT, BLIS_KT) for datatype dt, and
// FALSE otherwise. The dimensions are tested in the order m, n, k, and
// later thresholds are not queried once an earlier test succeeds.
BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx )
{
    const bool any_below =
        m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ||
        n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ||
        k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );

    return any_below ? TRUE : FALSE;
}
// -----------------------------------------------------------------------------
// Return the handler pointer stored for operation id op in the context's
// l3_sup_handlers table.
BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx )
{
    return bli_cntx_l3_sup_handlers_buf( cntx )[ op ];
}
// -----------------------------------------------------------------------------
// Return the address of the blksz_t stored for bs_id in the context's
// l3_sup_blkszs table.
BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
{
    blksz_t* table = bli_cntx_l3_sup_blkszs_buf( cntx );

    return table + bs_id;
}
// Return the default (main) blocksize value stored for datatype dt in the
// l3_sup_blkszs entry identified by bs_id.
BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
    return bli_blksz_get_def( dt, bli_cntx_get_l3_sup_blksz( bs_id, cntx ) );
}
// Return the maximum (auxiliary) blocksize value stored for datatype dt in
// the l3_sup_blkszs entry identified by bs_id.
BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
    return bli_blksz_get_max( dt, bli_cntx_get_l3_sup_blksz( bs_id, cntx ) );
}
// -----------------------------------------------------------------------------
// Return the address of the func_t stored for stor_id in the context's
// l3_sup_kers table.
BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx )
{
    func_t* table = bli_cntx_l3_sup_kers_buf( cntx );

    return table + stor_id;
}
// Return the datatype-specific kernel pointer for stor_id from the
// l3_sup_kers table. Unlike the other *_dt kernel getters in this header,
// this one is declared to return void* rather than void_fp.
BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    func_t* entry = bli_cntx_get_l3_sup_kers( stor_id, cntx );

    return bli_func_get_dt( dt, entry );
}
// -----------------------------------------------------------------------------
// Return the address of the mbool_t preference entry stored for stor_id in
// the context's l3_sup_kers_prefs table.
BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx )
{
    mbool_t* table = bli_cntx_l3_sup_kers_prefs_buf( cntx );

    return table + stor_id;
}
// Return, as a bool, the preference value stored for datatype dt in the
// l3_sup_kers_prefs entry for stor_id.
BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    mbool_t*   entry = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx );
    const bool pref  = ( bool )bli_mbool_get_dt( dt, entry );

    return pref;
}
// -----------------------------------------------------------------------------
// Return the address of the func_t stored for ker_id in the context's
// l1f_kers table.
BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx )
{
    func_t* table = bli_cntx_l1f_kers_buf( cntx );

    return table + ker_id;
}
// Return the datatype-specific function pointer for ker_id from the
// l1f_kers table.
BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx )
{
    func_t* entry = bli_cntx_get_l1f_kers( ker_id, cntx );
    void_fp fp    = bli_func_get_dt( dt, entry );

    return fp;
}
// -----------------------------------------------------------------------------
// Return the address of the func_t stored for ker_id in the context's
// l1v_kers table.
BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx )
{
    func_t* table = bli_cntx_l1v_kers_buf( cntx );

    return table + ker_id;
}
// Return the datatype-specific function pointer for ker_id from the
// l1v_kers table.
BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx )
{
    func_t* entry = bli_cntx_get_l1v_kers( ker_id, cntx );
    void_fp fp    = bli_func_get_dt( dt, entry );

    return fp;
}
// -----------------------------------------------------------------------------
// Return the address of the packm func_t for ker_id, or NULL when ker_id
// (viewed as a signed gint_t) lies outside [0, BLIS_NUM_PACKM_KERS), i.e.
// when the requested packm kernel is not one that is explicitly supported.
BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
    const gint_t id = ( gint_t )ker_id;

    // Reject ids that do not name a supported packm kernel.
    if ( !( 0 <= id && id < BLIS_NUM_PACKM_KERS ) ) return NULL;

    return bli_cntx_packm_kers_buf( cntx ) + ker_id;
}
// Return the datatype-specific packm kernel pointer for ker_id, or NULL when
// ker_id (viewed as a signed gint_t) lies outside [0, BLIS_NUM_PACKM_KERS).
// The context is only queried for supported kernel ids.
BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    const gint_t id = ( gint_t )ker_id;

    // Unsupported packm kernel: nothing to look up.
    if ( !( 0 <= id && id < BLIS_NUM_PACKM_KERS ) ) return NULL;

    return bli_func_get_dt( dt, bli_cntx_get_packm_kers( ker_id, cntx ) );
}
// Return the address of the unpackm func_t for ker_id, or NULL when ker_id
// (viewed as a signed gint_t) lies outside [0, BLIS_NUM_UNPACKM_KERS), i.e.
// when the requested unpackm kernel is not one that is explicitly supported.
BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
    const gint_t id = ( gint_t )ker_id;

    // Reject ids that do not name a supported unpackm kernel.
    if ( !( 0 <= id && id < BLIS_NUM_UNPACKM_KERS ) ) return NULL;

    return bli_cntx_unpackm_kers_buf( cntx ) + ker_id;
}
// Return the datatype-specific unpackm kernel pointer for ker_id, or NULL
// when ker_id (viewed as a signed gint_t) lies outside
// [0, BLIS_NUM_UNPACKM_KERS). The context is only queried for supported
// kernel ids.
BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    const gint_t id = ( gint_t )ker_id;

    // Unsupported unpackm kernel: nothing to look up.
    if ( !( 0 <= id && id < BLIS_NUM_UNPACKM_KERS ) ) return NULL;

    return bli_func_get_dt( dt, bli_cntx_get_unpackm_kers( ker_id, cntx ) );
}
// -----------------------------------------------------------------------------
// Return whether the native level-3 ukernel ukr_id prefers row storage for
// datatype dt. A stored preference of TRUE denotes a row preference.
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    return ( bool )( bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ) == TRUE );
}
// Return whether the native level-3 ukernel ukr_id prefers column storage
// for datatype dt. A stored preference of FALSE denotes a column preference.
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    return ( bool )( bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ) == FALSE );
}
// Return TRUE when the storage of obj (row or column) matches the storage
// preference of native ukernel ukr_id; FALSE otherwise (including when obj
// is neither row- nor column-stored).
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    // The computation datatype is used here; for mixed-datatype operations
    // it may differ from the storage datatype of C.
    const num_t comp_dt = bli_obj_comp_dt( obj );

    const bool likes_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( comp_dt, ukr_id, cntx );
    const bool likes_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( comp_dt, ukr_id, cntx );

    if ( bli_obj_is_row_stored( obj ) && likes_rows ) return TRUE;
    if ( bli_obj_is_col_stored( obj ) && likes_cols ) return TRUE;

    return FALSE;
}
// Logical negation of bli_cntx_l3_nat_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    const bool prefers = bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );

    return ( bool )!prefers;
}
// -----------------------------------------------------------------------------
// Return whether the virtual level-3 ukernel ukr_id prefers row storage for
// datatype dt. When the context's method is not BLIS_NAT (an induced
// method), the preference of the corresponding real-domain native ukernel is
// reported instead.
// NOTE: This projection to the real domain becomes unnecessary if the
// exec_dt for 1m is set to the real projection of the storage datatype.
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    const num_t query_dt = ( bli_cntx_method( cntx ) != BLIS_NAT )
                           ? bli_dt_proj_to_real( dt )
                           : dt;

    return bli_cntx_l3_nat_ukr_prefers_rows_dt( query_dt, ukr_id, cntx );
}
// Return whether the virtual level-3 ukernel ukr_id prefers column storage
// for datatype dt. When the context's method is not BLIS_NAT (an induced
// method), the preference of the corresponding real-domain native ukernel is
// reported instead.
// NOTE: This projection to the real domain becomes unnecessary if the
// exec_dt for 1m is set to the real projection of the storage datatype.
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    const num_t query_dt = ( bli_cntx_method( cntx ) != BLIS_NAT )
                           ? bli_dt_proj_to_real( dt )
                           : dt;

    return bli_cntx_l3_nat_ukr_prefers_cols_dt( query_dt, ukr_id, cntx );
}
// Return TRUE when the storage of obj (row or column) matches the storage
// preference of virtual ukernel ukr_id; FALSE otherwise (including when obj
// is neither row- nor column-stored).
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    // The computation datatype is used here; for mixed-datatype operations
    // it may differ from the storage datatype of C.
    const num_t comp_dt = bli_obj_comp_dt( obj );

    const bool likes_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( comp_dt, ukr_id, cntx );
    const bool likes_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( comp_dt, ukr_id, cntx );

    if ( bli_obj_is_row_stored( obj ) && likes_rows ) return TRUE;
    if ( bli_obj_is_col_stored( obj ) && likes_cols ) return TRUE;

    return FALSE;
}
// Logical negation of bli_cntx_l3_vir_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    const bool prefers = bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx );

    return ( bool )!prefers;
}
// -----------------------------------------------------------------------------
// Return whether the sup kernel registered for stor_id prefers row storage
// for datatype dt. A stored preference of TRUE denotes a row preference.
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    return ( bool )( bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ) == TRUE );
}
// Return whether the sup kernel registered for stor_id prefers column
// storage for datatype dt. A stored preference of FALSE denotes a column
// preference.
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    return ( bool )( bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ) == FALSE );
}
#if 0
// NOTE: These static functions aren't needed yet, so they are compiled out.
// They are the sup-kernel counterparts of the nat/vir *_prefers_storage_of()
// and *_dislikes_storage_of() helpers, keyed by stor3_t instead of l3ukr_t.
// Note that, unlike those helpers, this version queries bli_obj_dt() rather
// than bli_obj_comp_dt().

// Return TRUE when obj's storage (row or column) matches the storage
// preference of the sup kernel registered for stor_id.
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
const num_t dt = bli_obj_dt( obj );
const bool ukr_prefers_rows
= bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
const bool ukr_prefers_cols
= bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
bool r_val = FALSE;
if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
return r_val;
}
// Logical negation of bli_cntx_l3_sup_ker_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
return ( bool )
!bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
}
#endif
// -----------------------------------------------------------------------------
//
// -- cntx_t modification (complex) --------------------------------------------
//
// NOTE: The framework does not use any of the following functions. We provide
// them in order to facilitate creating/modifying custom contexts.
// Copy *blksz into the context's blocksize table at bs_id, and record
// mult_id as the id of bs_id's blocksize multiple.
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx )
{
    blksz_t* blksz_slot = bli_cntx_blkszs_buf( cntx ) + bs_id;
    bszid_t* bmult_slot = bli_cntx_bmults_buf( cntx ) + bs_id;

    *blksz_slot = *blksz;
    *bmult_slot = mult_id;
}
// Set the default (main) blocksize value for datatype dt, in the blksz_t
// identified by bs_id, to bs.
BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
{
    blksz_t* target = bli_cntx_blkszs_buf( cntx ) + bs_id;

    bli_blksz_set_def( bs, dt, target );
}
// Set the maximum (auxiliary) blocksize value for datatype dt, in the
// blksz_t identified by bs_id, to bs.
BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx )
{
    blksz_t* target = bli_cntx_blkszs_buf( cntx ) + bs_id;

    bli_blksz_set_max( bs, dt, target );
}
// Copy *func into the context's l3_vir_ukrs table at index ukr_id.
BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_l3_vir_ukrs_buf( cntx ) + ukr_id;

    *slot = *func;
}
// Copy *func into the context's l3_nat_ukrs table at index ukr_id.
BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_l3_nat_ukrs_buf( cntx ) + ukr_id;

    *slot = *func;
}
// Copy *prefs into the context's l3_nat_ukrs_prefs table at index ukr_id.
BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx )
{
    mbool_t* slot = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ) + ukr_id;

    *slot = *prefs;
}
// Copy *func into the context's l1f_kers table at index ker_id.
BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_l1f_kers_buf( cntx ) + ker_id;

    *slot = *func;
}
// Copy *func into the context's l1v_kers table at index ker_id.
BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_l1v_kers_buf( cntx ) + ker_id;

    *slot = *func;
}
// Copy *func into the context's packm kernel slot for ker_id.
//
// bli_cntx_get_packm_kers() already returns the address of the ker_id-th
// func_t (or NULL when ker_id is not a supported packm kernel id), so the
// copy must go through that pointer directly. Indexing the returned pointer
// by ker_id a second time would write to element 2*ker_id of the table.
BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_get_packm_kers( ker_id, cntx );

    // Unsupported kernel ids have no slot; leave the context untouched
    // rather than dereferencing NULL.
    if ( slot == NULL ) return;

    *slot = *func;
}
// Store the function pointer fp as the datatype-dt entry of the packm
// func_t for ker_id.
//
// bli_cntx_get_packm_kers() returns NULL when ker_id is not a supported
// packm kernel id; in that case nothing is stored, rather than handing a
// NULL func_t to bli_func_set_dt().
BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    func_t* slot = bli_cntx_get_packm_kers( ker_id, cntx );

    if ( slot == NULL ) return;

    bli_func_set_dt( fp, dt, slot );
}
// Copy *func into the context's unpackm kernel slot for ker_id.
//
// bli_cntx_get_unpackm_kers() already returns the address of the ker_id-th
// func_t (or NULL when ker_id is not a supported unpackm kernel id), so the
// copy must go through that pointer directly. Indexing the returned pointer
// by ker_id a second time would write to element 2*ker_id of the table.
BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx )
{
    func_t* slot = bli_cntx_get_unpackm_kers( ker_id, cntx );

    // Unsupported kernel ids have no slot; leave the context untouched
    // rather than dereferencing NULL.
    if ( slot == NULL ) return;

    *slot = *func;
}
// Store the function pointer fp as the datatype-dt entry of the unpackm
// func_t for ker_id.
//
// bli_cntx_get_unpackm_kers() returns NULL when ker_id is not a supported
// unpackm kernel id; in that case nothing is stored, rather than handing a
// NULL func_t to bli_func_set_dt().
BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    func_t* slot = bli_cntx_get_unpackm_kers( ker_id, cntx );

    if ( slot == NULL ) return;

    bli_func_set_dt( fp, dt, slot );
}
// -----------------------------------------------------------------------------
// Function prototypes
// Exported (BLIS_EXPORT_BLIS) context routines whose definitions live outside
// this header.
BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx );
// Variadic setters: each takes a leading count (n_bs, n_ukrs, n_thresh,
// n_ops, or n_kers) followed by a variable argument list. The layout of the
// variadic arguments is defined by the implementations, which are not
// visible in this header.
BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... );
BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... );
BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_const.c 0000664 0000000 0000000 00000006410 14634250137 0021754 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Statically initialize structs containing representations of various
// constants for each datatype supported in BLIS.
// One file-local (static) constdata_t buffer per constant value: 2, 1, 0,
// -1, and -2. Each is built by bli_obj_init_constdata() from a double
// literal.
static constdata_t bli_two_buffer = bli_obj_init_constdata( 2.0 );
static constdata_t bli_one_buffer = bli_obj_init_constdata( 1.0 );
static constdata_t bli_zero_buffer = bli_obj_init_constdata( 0.0 );
static constdata_t bli_mone_buffer = bli_obj_init_constdata( -1.0 );
static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 );
// Statically initialize global scalar constants, attaching the addresses
// of the corresponding structs above.
// Unlike the buffers, these obj_t objects are not static, so they have
// external linkage.
obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer );
obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer );
obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer );
obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer );
obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer );
#if 0
// Disabled alternative: create and free the global constants at runtime
// instead of initializing them statically as above. This region is compiled
// out. Note that it refers to BLIS_ONE_HALF and BLIS_MINUS_ONE_HALF, which
// are not among the objects declared in this file.
obj_t BLIS_TWO = {};
obj_t BLIS_ONE = {};
obj_t BLIS_ZERO = {};
obj_t BLIS_MINUS_ONE = {};
obj_t BLIS_MINUS_TWO = {};
// Create each global constant object from its double value.
void bli_const_init( void )
{
bli_obj_create_const( 2.0, &BLIS_TWO );
bli_obj_create_const( 1.0, &BLIS_ONE );
bli_obj_create_const( 0.5, &BLIS_ONE_HALF );
bli_obj_create_const( 0.0, &BLIS_ZERO );
bli_obj_create_const( -0.5, &BLIS_MINUS_ONE_HALF );
bli_obj_create_const( -1.0, &BLIS_MINUS_ONE );
bli_obj_create_const( -2.0, &BLIS_MINUS_TWO );
}
// Free every object created by bli_const_init(), in the same order.
void bli_const_finalize( void )
{
bli_obj_free( &BLIS_TWO );
bli_obj_free( &BLIS_ONE );
bli_obj_free( &BLIS_ONE_HALF );
bli_obj_free( &BLIS_ZERO );
bli_obj_free( &BLIS_MINUS_ONE_HALF );
bli_obj_free( &BLIS_MINUS_ONE );
bli_obj_free( &BLIS_MINUS_TWO );
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_const.h 0000664 0000000 0000000 00000003310 14634250137 0021755 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the constant-object setup and teardown routines.
// NOTE(review): the only definitions of these two functions visible in
// bli_const.c sit inside an #if 0 region -- confirm whether these
// declarations are still needed.
void bli_const_init( void );
void bli_const_finalize( void );
cython-blis-1.0.0/blis/_src/frame/base/bli_cpuid.c 0000664 0000000 0000000 00000116417 14634250137 0021743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
Copyright (C) 2019, Dave Love, University of Manchester
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if 0
// Used only during standalone testing of ARM support.
#include "bli_system.h"
#include "bli_type_defs.h"
#include "bli_cpuid.h"
#undef __x86_64__
#undef _M_X64
#undef __i386
#undef _M_IX86
#define __arm__
#endif
#ifdef BLIS_CONFIGURETIME_CPUID
// NOTE: If you need to make any changes to this cpp branch, it's probably
// the case that you also need to modify the corresponding cpp branches in
// bli_arch.c and bli_env.c. Don't forget to update those files as needed!
// The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp
// branch in bli_system.h is processed. (This macro is normally defined in
// bli_config.h.)
#define BLIS_ENABLE_SYSTEM
// Use C-style static inline functions for any static inline functions that
// happen to be defined by the headers below. (This macro is normally defined
// in bli_config_macro_defs.h.)
#define BLIS_INLINE static
// Since we're not building a shared library, we can forgo the use of the
// BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro
// is normally defined in bli_config_macro_defs.h.)
#define BLIS_EXPORT_BLIS
#include "bli_system.h"
#include "bli_type_defs.h"
#include "bli_arch.h"
#include "bli_cpuid.h"
//#include "bli_env.h"
#else
#include "blis.h"
#endif
// -----------------------------------------------------------------------------
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
#include "cpuid.h"
// x86/x86_64 implementation: query the CPU via bli_cpuid_query() and map the
// result to an arch_t id.
//
// For Intel and AMD vendors, each sub-configuration test is compiled in only
// when the matching BLIS_CONFIG_* macro is defined, and the tests run in the
// order written below; the first one that matches determines the return
// value. BLIS_ARCH_GENERIC is returned when no enabled sub-configuration
// matches, and for any other vendor value.
arch_t bli_cpuid_query_id( void )
{
uint32_t vendor, family, model, features;
// Call the CPUID instruction and parse its results into a family id,
// model id, and a feature bit field. The return value encodes the
// vendor.
vendor = bli_cpuid_query( &family, &model, &features );
#if 0
// Disabled debugging output of the raw query results.
printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" );
printf("family = %x\n", family );
printf( "model = %x\n", model );
printf( "features = %x\n", features );
#endif
if ( vendor == VENDOR_INTEL )
{
// For each Intel configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_SKX
if ( bli_cpuid_is_skx( family, model, features ) )
return BLIS_ARCH_SKX;
#endif
#ifdef BLIS_CONFIG_KNL
if ( bli_cpuid_is_knl( family, model, features ) )
return BLIS_ARCH_KNL;
#endif
#ifdef BLIS_CONFIG_HASWELL
if ( bli_cpuid_is_haswell( family, model, features ) )
return BLIS_ARCH_HASWELL;
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
if ( bli_cpuid_is_sandybridge( family, model, features ) )
return BLIS_ARCH_SANDYBRIDGE;
#endif
#ifdef BLIS_CONFIG_PENRYN
if ( bli_cpuid_is_penryn( family, model, features ) )
return BLIS_ARCH_PENRYN;
#endif
// If none of the other sub-configurations were detected, return
// the 'generic' arch_t id value.
return BLIS_ARCH_GENERIC;
}
else if ( vendor == VENDOR_AMD )
{
// For each AMD configuration that is enabled, check for that
// microarchitecture. We check from most recent to most dated.
#ifdef BLIS_CONFIG_ZEN3
if ( bli_cpuid_is_zen3( family, model, features ) )
return BLIS_ARCH_ZEN3;
#endif
#ifdef BLIS_CONFIG_ZEN2
if ( bli_cpuid_is_zen2( family, model, features ) )
return BLIS_ARCH_ZEN2;
#endif
#ifdef BLIS_CONFIG_ZEN
if ( bli_cpuid_is_zen( family, model, features ) )
return BLIS_ARCH_ZEN;
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
if ( bli_cpuid_is_excavator( family, model, features ) )
return BLIS_ARCH_EXCAVATOR;
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
if ( bli_cpuid_is_steamroller( family, model, features ) )
return BLIS_ARCH_STEAMROLLER;
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
if ( bli_cpuid_is_piledriver( family, model, features ) )
return BLIS_ARCH_PILEDRIVER;
#endif
#ifdef BLIS_CONFIG_BULLDOZER
if ( bli_cpuid_is_bulldozer( family, model, features ) )
return BLIS_ARCH_BULLDOZER;
#endif
// If none of the other sub-configurations were detected, return
// the 'generic' arch_t id value.
return BLIS_ARCH_GENERIC;
}
else if ( vendor == VENDOR_UNKNOWN )
{
// Unrecognized vendor: fall back to the generic configuration.
return BLIS_ARCH_GENERIC;
}
// Any vendor value not handled above also maps to the generic
// configuration.
return BLIS_ARCH_GENERIC;
}
// -----------------------------------------------------------------------------
// Decide whether the 'skx' sub-configuration should be used.
//
// Returns TRUE only when every expected feature bit (AVX, FMA3, AVX2,
// AVX512F/DQ/BW/VL) is present in `features` AND vpu_count() reports exactly
// two FMA units. With one unit, or an unknown count, FALSE is returned and a
// message noting the fallback to 'haswell' is logged. The family and model
// arguments are not consulted.
bool bli_cpuid_is_skx
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    // Check for expected CPU features.
    const uint32_t expected = FEATURE_AVX      |
                              FEATURE_FMA3     |
                              FEATURE_AVX2     |
                              FEATURE_AVX512F  |
                              FEATURE_AVX512DQ |
                              FEATURE_AVX512BW |
                              FEATURE_AVX512VL ;

    // Query the FMA unit count before testing the feature bits, as the
    // original code did.
    const int nvpu = vpu_count();

    if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;

    // Every case returns, so no code is needed (or reachable) after the
    // switch.
    switch ( nvpu )
    {
        case 1:
            bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" );
            return FALSE;
        case 2:
            bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" );
            return TRUE;
        default:
            bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" );
            return FALSE;
    }
}
// Return TRUE when `features` contains every bit required for the 'knl'
// sub-configuration (AVX, FMA3, AVX2, AVX512F, AVX512PF). The family and
// model arguments are not consulted.
bool bli_cpuid_is_knl
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX     |
                              FEATURE_FMA3    |
                              FEATURE_AVX2    |
                              FEATURE_AVX512F |
                              FEATURE_AVX512PF;

    if ( bli_cpuid_has_features( features, required ) ) return TRUE;

    return FALSE;
}
// Return TRUE when `features` contains every bit required for the 'haswell'
// sub-configuration (AVX, FMA3, AVX2). The family and model arguments are
// not consulted.
bool bli_cpuid_is_haswell
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX  |
                              FEATURE_FMA3 |
                              FEATURE_AVX2;

    if ( bli_cpuid_has_features( features, required ) ) return TRUE;

    return FALSE;
}
// Return TRUE when `features` contains the AVX bit, the only requirement for
// the 'sandybridge' sub-configuration. The family and model arguments are
// not consulted.
bool bli_cpuid_is_sandybridge
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX;

    if ( bli_cpuid_has_features( features, required ) ) return TRUE;

    return FALSE;
}
// Return TRUE when `features` contains every bit required for the 'penryn'
// sub-configuration (SSE3, SSSE3). The family and model arguments are not
// consulted.
bool bli_cpuid_is_penryn
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_SSE3 |
                              FEATURE_SSSE3;

    if ( bli_cpuid_has_features( features, required ) ) return TRUE;

    return FALSE;
}
// -----------------------------------------------------------------------------
// Return TRUE when the CPU matches the 'zen3' sub-configuration: the
// AVX/FMA3/AVX2 feature bits are present, the family is 0x19, and the model
// lies in 0x00 ~ 0xff.
bool bli_cpuid_is_zen3
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX  |
                              FEATURE_FMA3 |
                              FEATURE_AVX2;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Zen3 cores have a family of 0x19.
    if ( family != 0x19 ) return FALSE;

    // NOTE: Any model in 0x00 ~ 0xff is accepted because family 25 (0x19)
    // is unique. model is unsigned, so the lower bound of 0x00 always holds
    // and only the upper bound needs testing.
    if ( model > 0xff ) return FALSE;

    return TRUE;
}
// Return TRUE when the CPU matches the 'zen2' sub-configuration: the
// AVX/FMA3/AVX2 feature bits are present and either
//   - the family is 0x17 with a model in 0x30 ~ 0xff, or
//   - BLIS was built without BLIS_CONFIG_ZEN3 and the family is 0x19 with a
//     model in 0x00 ~ 0xff (Zen 3 hardware falling back to Zen 2 kernels).
bool bli_cpuid_is_zen2
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX  |
                              FEATURE_FMA3 |
                              FEATURE_AVX2;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Zen2 cores have a family of 0x17; the model separates them from
    // Zen cores, which share that family.
    if ( family == 0x17 )
        return ( model >= 0x30 && model <= 0xff );

#ifndef BLIS_CONFIG_ZEN3
    // Fallback to Zen 2 kernels on Zen 3, when blis is compiled without
    // Zen 3 support (e.g. because it requires a newer compiler). model is
    // unsigned, so only the upper bound needs testing.
    if ( family == 0x19 )
        return ( model <= 0xff );
#endif

    return FALSE;
}
// Return TRUE when the CPU matches the 'zen' sub-configuration: the
// AVX/FMA3/AVX2 feature bits are present, the family is 0x17, and the model
// lies in 0x00 ~ 0x2f.
bool bli_cpuid_is_zen
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX  |
                              FEATURE_FMA3 |
                              FEATURE_AVX2;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Zen cores have a family of 0x17.
    if ( family != 0x17 ) return FALSE;

    // NOTE: The model must be checked because family 23 (0x17) is shared
    // with zen2. model is unsigned, so the lower bound of 0x00 always holds
    // and only the upper bound needs testing.
    if ( model > 0x2f ) return FALSE;

    return TRUE;
}
// Return TRUE when the CPU matches the 'excavator' sub-configuration: the
// AVX/FMA3/AVX2 feature bits are present, the family is 0x15, and the model
// lies in 0x60 ~ 0x7f.
bool bli_cpuid_is_excavator
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    const uint32_t required = FEATURE_AVX  |
                              FEATURE_FMA3 |
                              FEATURE_AVX2;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Excavator cores have a family of 0x15.
    if ( family != 0x15 ) return FALSE;

    // Accepted models: 0x60 ~ 0x7f.
    if ( model < 0x60 || model > 0x7f ) return FALSE;

    return TRUE;
}
bool bli_cpuid_is_steamroller
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    // Steamroller is expected to report AVX, FMA3, and FMA4.
    const uint32_t required = FEATURE_AVX | FEATURE_FMA3 | FEATURE_FMA4;

    // Reject on the first criterion that fails: feature set, then family
    // (0x15), then the Steamroller model range 0x30 ~ 0x3f.
    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;
    if ( family != 0x15 )                                return FALSE;
    if ( model < 0x30 || 0x3f < model )                  return FALSE;

    return TRUE;
}
bool bli_cpuid_is_piledriver
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    // Piledriver is expected to report AVX, FMA3, and FMA4.
    const uint32_t required = FEATURE_AVX | FEATURE_FMA3 | FEATURE_FMA4;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Piledriver cores have a family of 0x15.
    if ( family != 0x15 ) return FALSE;

    // Accepted models: the single value 0x02, or anything in 0x10 ~ 0x1f.
    if ( model == 0x02 )                  return TRUE;
    if ( 0x10 <= model && model <= 0x1f ) return TRUE;

    return FALSE;
}
bool bli_cpuid_is_bulldozer
     (
       uint32_t family,
       uint32_t model,
       uint32_t features
     )
{
    // Bulldozer is expected to report AVX and FMA4 (FMA3 is not required).
    const uint32_t required = FEATURE_AVX | FEATURE_FMA4;

    if ( !bli_cpuid_has_features( features, required ) ) return FALSE;

    // All Bulldozer cores have a family of 0x15.
    if ( family != 0x15 ) return FALSE;

    // Only models 0x00 and 0x01 are Bulldozer.
    switch ( model )
    {
        case 0x00:
        case 0x01:
            return TRUE;

        default:
            return FALSE;
    }
}
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
arch_t bli_cpuid_query_id( void )
{
    uint32_t vendor, model, part, features;

    vendor = bli_cpuid_query( &model, &part, &features );

#if 0
    printf( "vendor   = %u\n", vendor );
    printf( "model    = %u\n", model );
    printf( "part     = 0x%x\n", part );
    printf( "features = %u\n", features );
#endif

    // Anything that is not recognized as an ARM vendor gets the 'generic'
    // arch_t id value.
    if ( vendor != VENDOR_ARM ) return BLIS_ARCH_GENERIC;

    if ( model == MODEL_ARMV8 )
    {
        // The aarch64 bli_cpuid_query() stores the result of get_coretype()
        // in 'part', so it is handed back as the arch_t id without further
        // checks.
        // NOTE(review): the 32-bit ARM bli_cpuid_query() stores the raw
        // "CPU part" number from /proc/cpuinfo in 'part' instead; confirm
        // that MODEL_ARMV8 cannot reach this point in that build.
        return part;
    }

    if ( model == MODEL_ARMV7 )
    {
        // Check for each ARMv7 configuration that is enabled, from most
        // recent to most dated.
#ifdef BLIS_CONFIG_CORTEXA15
        if ( bli_cpuid_is_cortexa15( model, part, features ) )
            return BLIS_ARCH_CORTEXA15;
#endif
#ifdef BLIS_CONFIG_CORTEXA9
        if ( bli_cpuid_is_cortexa9( model, part, features ) )
            return BLIS_ARCH_CORTEXA9;
#endif
    }

    // If none of the other sub-configurations were detected (or the model
    // is unknown), return the 'generic' arch_t id value.
    return BLIS_ARCH_GENERIC;
}
bool bli_cpuid_is_cortexa15
(
uint32_t family,
uint32_t model,
uint32_t features
)
{
// Check for expected CPU features.
const uint32_t expected = FEATURE_NEON;
return bli_cpuid_has_features( features, expected ) && model == 0xc0f;
}
bool bli_cpuid_is_cortexa9
(
uint32_t family,
uint32_t model,
uint32_t features
)
{
// Check for expected CPU features.
const uint32_t expected = FEATURE_NEON;
return bli_cpuid_has_features( features, expected ) && model == 0xc09;
}
#endif
// -----------------------------------------------------------------------------
//
// This section of the file was based off of cpuid.cxx from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//
/*
Copyright (C) 2017, The University of Texas at Austin
Copyright (C) 2017, Devin Matthews
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
// Raw bit masks used by bli_cpuid_query() to decode the registers written
// by the cpuid instruction (FEATURE_MASK_*) and by xgetbv (XGETBV_MASK_*).
// The trailing comment on each entry names the cpuid leaf that must be
// queried and the output register/bit the mask applies to. Masks that refer
// to different registers may share a numeric value (e.g. FEATURE_MASK_FMA4
// and FEATURE_MASK_AVX512F), so each mask must only be tested against the
// register named in its comment.
enum
{
// input register(s) output register
FEATURE_MASK_SSE3 = (1u<< 0), // cpuid[eax=1] :ecx[0]
FEATURE_MASK_SSSE3 = (1u<< 9), // cpuid[eax=1] :ecx[9]
FEATURE_MASK_SSE41 = (1u<<19), // cpuid[eax=1] :ecx[19]
FEATURE_MASK_SSE42 = (1u<<20), // cpuid[eax=1] :ecx[20]
FEATURE_MASK_AVX = (1u<<28), // cpuid[eax=1] :ecx[28]
FEATURE_MASK_AVX2 = (1u<< 5), // cpuid[eax=7,ecx=0] :ebx[5]
FEATURE_MASK_FMA3 = (1u<<12), // cpuid[eax=1] :ecx[12]
FEATURE_MASK_FMA4 = (1u<<16), // cpuid[eax=0x80000001]:ecx[16]
FEATURE_MASK_AVX512F = (1u<<16), // cpuid[eax=7,ecx=0] :ebx[16]
FEATURE_MASK_AVX512DQ = (1u<<17), // cpuid[eax=7,ecx=0] :ebx[17]
FEATURE_MASK_AVX512PF = (1u<<26), // cpuid[eax=7,ecx=0] :ebx[26]
FEATURE_MASK_AVX512ER = (1u<<27), // cpuid[eax=7,ecx=0] :ebx[27]
FEATURE_MASK_AVX512CD = (1u<<28), // cpuid[eax=7,ecx=0] :ebx[28]
FEATURE_MASK_AVX512BW = (1u<<30), // cpuid[eax=7,ecx=0] :ebx[30]
FEATURE_MASK_AVX512VL = (1u<<31), // cpuid[eax=7,ecx=0] :ebx[31]
// Two bits combined: both must be set for bli_cpuid_query() to execute
// xgetbv.
FEATURE_MASK_XGETBV = (1u<<26)|
(1u<<27), // cpuid[eax=1] :ecx[27:26]
// Bits of xcr0 (as returned in eax by xgetbv) that bli_cpuid_query() checks
// before keeping the SSE, AVX, and AVX-512 feature flags, respectively.
XGETBV_MASK_XMM = 0x02u, // xcr0[1]
XGETBV_MASK_YMM = 0x04u, // xcr0[2]
XGETBV_MASK_ZMM = 0xe0u // xcr0[7:5]
};
// Query the x86 cpuid instruction (and xgetbv, when available) and report
// what was found.
//
// Outputs:
// - family:   the "display" family, computed from the base and extended
//             family fields of cpuid leaf 1 (see the comment below).
// - model:    the "display" model, computed likewise.
// - features: a bitwise OR of the FEATURE_* flags that the hardware reports
//             AND for which the OS has enabled the necessary register state.
//
// Returns VENDOR_AMD, VENDOR_INTEL, or VENDOR_UNKNOWN. All three outputs are
// zeroed before anything else happens, so they are well-defined even on the
// early VENDOR_UNKNOWN return.
uint32_t bli_cpuid_query
     (
       uint32_t* family,
       uint32_t* model,
       uint32_t* features
     )
{
    uint32_t eax, ebx, ecx, edx;

    uint32_t old_model  = 0;
    uint32_t old_family = 0;
    uint32_t ext_model  = 0;
    uint32_t ext_family = 0;

    *family   = 0;
    *model    = 0;
    *features = 0;

    uint32_t cpuid_max     = __get_cpuid_max( 0, 0 );
    uint32_t cpuid_max_ext = __get_cpuid_max( 0x80000000u, 0 );

    if ( cpuid_max < 1 ) return VENDOR_UNKNOWN;

    // The fourth '0' serves as the NULL-terminator for the vendor string.
    uint32_t vendor_string[4] = { 0, 0, 0, 0 };

    // This is actually a macro that modifies the last four operands,
    // hence why they are not passed by address. Note the register order
    // (ebx, edx, ecx) needed to spell out the vendor string.
    __cpuid( 0, eax, vendor_string[0],
                     vendor_string[2],
                     vendor_string[1] );

    // Check extended feature bits for post-AVX2 features.
    if ( cpuid_max >= 7 )
    {
        // This is actually a macro that modifies the last four operands,
        // hence why they are not passed by address.
        __cpuid_count( 7, 0, eax, ebx, ecx, edx );

        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX2     ) ) *features |= FEATURE_AVX2;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512F  ) ) *features |= FEATURE_AVX512F;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512DQ ) ) *features |= FEATURE_AVX512DQ;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512PF ) ) *features |= FEATURE_AVX512PF;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512ER ) ) *features |= FEATURE_AVX512ER;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512CD ) ) *features |= FEATURE_AVX512CD;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512BW ) ) *features |= FEATURE_AVX512BW;
        if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512VL ) ) *features |= FEATURE_AVX512VL;
    }

    // Check extended processor info / features bits for AMD-specific features.
    if ( cpuid_max_ext >= 0x80000001u )
    {
        // This is actually a macro that modifies the last four operands,
        // hence why they are not passed by address.
        __cpuid( 0x80000001u, eax, ebx, ecx, edx );

        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA4 ) ) *features |= FEATURE_FMA4;
    }

    // Unconditionally check processor info / features bits.
    {
        // This is actually a macro that modifies the last four operands,
        // hence why they are not passed by address.
        __cpuid( 1, eax, ebx, ecx, edx );

        /*
           cpuid(eax=1): eax[27:0]

             3: 0 - Stepping
             7: 4 - Model
            11: 8 - Family
            13:12 - Processor Type
            19:16 - Extended Model
            27:20 - Extended Family

           Intel and AMD have suggested applications to display the family of a
           CPU as the sum of the "Family" and the "Extended Family" fields shown
           above, and the model as the sum of the "Model" and the 4-bit
           left-shifted "Extended Model" fields. If "Family" is different than
           6 or 15, only the "Family" and "Model" fields should be used while the
           "Extended Family" and "Extended Model" bits are reserved. If "Family"
           is set to 15, then "Extended Family" and the 4-bit left-shifted
           "Extended Model" should be added to the respective base values, and if
           "Family" is set to 6, then only the 4-bit left-shifted "Extended Model"
           should be added to "Model".
        */

        old_model  = ( eax >>  4 ) & ( 0xF  ); // bits 7:4
        old_family = ( eax >>  8 ) & ( 0xF  ); // bits 11:8

        ext_model  = ( eax >> 16 ) & ( 0xF  ); // bits 19:16
        ext_family = ( eax >> 20 ) & ( 0xFF ); // bits 27:20

        // Set the display model and family values based on the original family
        // value. See explanation above.
        if ( old_family == 6 )
        {
            *model  = ( ext_model << 4 ) + old_model;
            *family =                      old_family;
        }
        else if ( old_family == 15 )
        {
            *model  = ( ext_model << 4 ) + old_model;
            *family = ( ext_family     ) + old_family;
        }
        else
        {
            *model  =                      old_model;
            *family =                      old_family;
        }

        // Check for SSE, AVX, and FMA3 features.
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE3  ) ) *features |= FEATURE_SSE3;
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSSE3 ) ) *features |= FEATURE_SSSE3;
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE41 ) ) *features |= FEATURE_SSE41;
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE42 ) ) *features |= FEATURE_SSE42;
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_AVX   ) ) *features |= FEATURE_AVX;
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA3  ) ) *features |= FEATURE_FMA3;

        // Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND
        // support for these is enabled by the OS. If so, then we proceed with
        // checking that various register-state saving features are available.
        if ( bli_cpuid_has_features( ecx, FEATURE_MASK_XGETBV ) )
        {
            uint32_t xcr = 0;

            // Call xgetbv to get xcr0 (the extended control register) copied
            // to [edx:eax]. This encodes whether software supports various
            // register state-saving features. The instruction is emitted as
            // raw bytes rather than by mnemonic.
            __asm__ __volatile__
            (
              ".byte 0x0F, 0x01, 0xD0"
              : "=a" (eax),
                "=d" (edx)
              : "c" (xcr)
              : "cc"
            );

            // The OS can manage the state of 512-bit zmm (AVX-512) registers
            // only if the xcr[7:5] bits are set. If they are not set, then
            // clear all feature bits related to AVX-512.
            if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM |
                                               XGETBV_MASK_YMM |
                                               XGETBV_MASK_ZMM ) )
            {
                *features &= ~( FEATURE_AVX512F  |
                                FEATURE_AVX512DQ |
                                FEATURE_AVX512PF |
                                FEATURE_AVX512ER |
                                FEATURE_AVX512CD |
                                FEATURE_AVX512BW |
                                FEATURE_AVX512VL );
            }

            // The OS can manage the state of 256-bit ymm (AVX) registers
            // only if the xcr[2] bit is set. If it is not set, then
            // clear all feature bits related to AVX.
            if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM |
                                               XGETBV_MASK_YMM ) )
            {
                *features &= ~( FEATURE_AVX  |
                                FEATURE_AVX2 |
                                FEATURE_FMA3 |
                                FEATURE_FMA4 );
            }

            // The OS can manage the state of 128-bit xmm (SSE) registers
            // only if the xcr[1] bit is set. If it is not set, then
            // clear all feature bits related to SSE (which means the
            // entire bitfield is clear).
            if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM ) )
            {
                *features = 0;
            }
        }
        else
        {
            // If the hardware does not support xsave/xrestor/xsetbv/xgetbv,
            // OR these features are not enabled by the OS, then we clear
            // the bitfield.
            //
            // NOTE: This must write through the pointer. Assigning to
            // 'features' itself would only null the local pointer and leave
            // the caller's AVX/AVX-512 flags set even though the OS cannot
            // preserve the corresponding register state.
            *features = 0;
        }
    }

    // Check the vendor string and return a value to indicate Intel or AMD.
    if      ( strcmp( ( char* )vendor_string, "AuthenticAMD" ) == 0 )
        return VENDOR_AMD;
    else if ( strcmp( ( char* )vendor_string, "GenuineIntel" ) == 0 )
        return VENDOR_INTEL;
    else
        return VENDOR_UNKNOWN;
}
// Copy the 48-byte processor brand string into 'cpu_name'.
//
// The string is assembled from cpuid leaves 0x80000002, 0x80000003, and
// 0x80000004; each leaf contributes 16 bytes, taken from eax, ebx, ecx, and
// edx in that order. Exactly 48 bytes are written and no terminator is
// appended, so the caller must supply at least 48 bytes (more if it needs a
// guaranteed NUL at the end).
void get_cpu_name( char *cpu_name )
{
    uint32_t regs[4];

    for ( uint32_t leaf = 0; leaf < 3; ++leaf )
    {
        // This is actually a macro that modifies the last four operands,
        // hence why they are not passed by address.
        __cpuid( 0x80000002u + leaf, regs[0], regs[1], regs[2], regs[3] );

        // Use memcpy() rather than storing through a casted uint32_t*:
        // 'cpu_name' is a char buffer with no alignment guarantee, and
        // writing to it through a uint32_t lvalue violates the aliasing
        // rules.
        memcpy( cpu_name + 16 * leaf, regs, sizeof( regs ) );
    }
}
// Return the number of FMA units _assuming avx512 is supported_.
// This needs updating for new processor types, sigh.
// See https://ark.intel.com/content/www/us/en/ark.html#@Processors
// and also https://github.com/jeffhammond/vpu-count
// Parse the processor brand string and return the number of FMA units
// (1 or 2), or -1 if the processor is not recognized.
int vpu_count( void )
{
    // get_cpu_name() writes exactly 48 bytes and does not terminate them,
    // so reserve one extra zero-initialized byte to guarantee that the
    // string functions below always find a terminator.
    char  cpu_name[49] = { 0 };
    char  model_num[5];
    char* loc;
    int   sku;

    get_cpu_name( cpu_name );

    if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL )
    {
        if ( strstr( cpu_name, "Platinum" ) != NULL )
            return 2;

        loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below

        if ( loc == NULL )
        {
            if ( strstr( cpu_name, "Silver" ) != NULL )
                return 1;

            if ( strstr( cpu_name, "Bronze" ) != NULL )
                return 1;

            loc = strstr( cpu_name, "W" );

            if ( loc == NULL )
            {
                // Fixme: May be wrong
                if ( strstr( cpu_name, "D" ) != NULL )
                    return 1;

                return -1;
            }
        }

        // We may have W-nnnn rather than, say, Gold nnnn. Either way,
        // leave 'loc' on the character just before the model number.
        if ( 'W' == *loc && '-' == *( loc + 1 ) )
            loc++;
        else
            loc = strstr( loc + 1, " " );

        if ( loc == NULL )
            return -1;

        strncpy( model_num, loc + 1, 4 );
        model_num[4] = '\0'; // Things like i9-10900X matched above

        sku = atoi( model_num );

        // These were derived from ARK listings as of 2019-10-09, but
        // may not be complete, especially as the ARK Skylake listing
        // seems to be limited.
        if      ( 8199 >= sku && sku >= 8100 ) return 2;
        else if ( 6199 >= sku && sku >= 6100 ) return 2;
        else if (                sku == 5122 ) return 2;
        else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold
        else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold
        else if ( 5199 >= sku && sku >= 5100 ) return 1;
        else if ( 4199 >= sku && sku >= 4100 ) return 1;
        else if ( 3199 >= sku && sku >= 3100 ) return 1;
        else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W
        else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W
        else if ( 2199 >= sku && sku >= 2120 ) return 2;
        else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions
        else if ( 2119 >= sku && sku >= 2100 ) return 1;
        else return -1;
    }
    else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL )
    {
        return 2; // All i7/i9 with avx512?
    }
    else
    {
        return -1;
    }
}
#elif defined(__aarch64__)
#ifdef __linux__
// This is adapted from OpenBLAS. See
// https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html
// for the mechanism, but not the magic numbers.
// Fixme: Could these be missing in older Linux?
#include
#include
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
/* From https://www.kernel.org/doc/html/latest/arm64/sve.html and the
aarch64 hwcap.h */
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
/* Maybe also for AT_HWCAP2
#define HWCAP2_SVE2(1 << 1)
et al
) */
#endif //__linux__
#ifdef __APPLE__
#include
// #include
#endif
static uint32_t get_coretype
(
uint32_t* features
)
{
int implementer = 0x00, part = 0x000;
*features = FEATURE_NEON;
#ifdef __linux__
if ( getauxval( AT_HWCAP ) & HWCAP_CPUID )
{
// Also available from
// /sys/devices/system/cpu/cpu0/regs/identification/midr_el1
// and split out in /proc/cpuinfo (with a tab before the colon):
// CPU part : 0x0a1
uint64_t midr_el1;
__asm("mrs %0, MIDR_EL1" : "=r" (midr_el1));
/*
* MIDR_EL1
*
* 31 24 23 20 19 16 15 4 3 0
* -----------------------------------------------------------------
* | Implementer | Variant | Architecture | Part Number | Revision |
* -----------------------------------------------------------------
*/
implementer = (midr_el1 >> 24) & 0xFF;
part = (midr_el1 >> 4) & 0xFFF;
}
bool has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE;
if (has_sve)
*features |= FEATURE_SVE;
#endif //__linux__
#ifdef __APPLE__
// Better values could be obtained from sysctlbyname()
implementer = 0x61; //Apple
part = 0x023; //Firestorm
#endif //__APPLE__
// From Linux arch/arm64/include/asm/cputype.h
// ARM_CPU_IMP_ARM 0x41
// ARM_CPU_IMP_APM 0x50
// ARM_CPU_IMP_CAVIUM 0x43
// ARM_CPU_IMP_BRCM 0x42
// ARM_CPU_IMP_QCOM 0x51
// ARM_CPU_IMP_NVIDIA 0x4E
// ARM_CPU_IMP_FUJITSU 0x46
// ARM_CPU_IMP_HISI 0x48
// ARM_CPU_IMP_APPLE 0x61
//
// ARM_CPU_PART_AEM_V8 0xD0F
// ARM_CPU_PART_FOUNDATION 0xD00
// ARM_CPU_PART_CORTEX_A57 0xD07
// ARM_CPU_PART_CORTEX_A72 0xD08
// ARM_CPU_PART_CORTEX_A53 0xD03
// ARM_CPU_PART_CORTEX_A73 0xD09
// ARM_CPU_PART_CORTEX_A75 0xD0A
// ARM_CPU_PART_CORTEX_A35 0xD04
// ARM_CPU_PART_CORTEX_A55 0xD05
// ARM_CPU_PART_CORTEX_A76 0xD0B
// ARM_CPU_PART_NEOVERSE_N1 0xD0C
// ARM_CPU_PART_CORTEX_A77 0xD0D
// from GCC:
// ARM_CPU_PART_CORTEX_A78 0xd41
// ARM_CPU_PART_CORTEX_X1 0xd44
// ARM_CPU_PART_CORTEX_V1 0xd40
// ARM_CPU_PART_CORTEX_N2 0xd49
// ARM_CPU_PART_CORTEX_R82 0xd15
//
// APM_CPU_PART_POTENZA 0x000
//
// CAVIUM_CPU_PART_THUNDERX 0x0A1
// CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2
// CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
// CAVIUM_CPU_PART_THUNDERX2 0x0AF
// CAVIUM_CPU_PART_THUNDERX3 0x0B8 // taken from OpenBLAS
//
// BRCM_CPU_PART_BRAHMA_B53 0x100
// BRCM_CPU_PART_VULCAN 0x516
//
// QCOM_CPU_PART_FALKOR_V1 0x800
// QCOM_CPU_PART_FALKOR 0xC00
// QCOM_CPU_PART_KRYO 0x200
// QCOM_CPU_PART_KRYO_3XX_SILVER 0x803
// QCOM_CPU_PART_KRYO_4XX_GOLD 0x804
// QCOM_CPU_PART_KRYO_4XX_SILVER 0x805
//
// NVIDIA_CPU_PART_DENVER 0x003
// NVIDIA_CPU_PART_CARMEL 0x004
//
// FUJITSU_CPU_PART_A64FX 0x001
//
// HISI_CPU_PART_TSV110 0xD01
// APPLE_CPU_PART_M1_ICESTORM 0x022
// APPLE_CPU_PART_M1_FIRESTORM 0x023
// Fixme: After merging the vpu_count branch we could report the
// part here with bli_dolog.
switch(implementer)
{
case 0x41: // ARM
switch (part)
{
#ifdef BLIS_CONFIG_CORTEXA57
case 0xd07: // Cortex A57
return BLIS_ARCH_CORTEXA57;
#endif
#ifdef BLIS_CONFIG_CORTEXA53
case 0xd03: // Cortex A53
return BLIS_ARCH_CORTEXA53;
#endif
#ifdef BLIS_CONFIG_THUNDERX2
case 0xd0c: // Neoverse N1 (and Graviton G2?)
return BLIS_ARCH_THUNDERX2; //placeholder for N1
#endif
}
break;
case 0x42: // Broadcom
switch (part)
{
#ifdef BLIS_CONFIG_THUNDERX2
case 0x516: // Vulcan
return BLIS_ARCH_THUNDERX2;
#endif
}
break;
case 0x43: // Cavium
switch (part)
{
#ifdef BLIS_CONFIG_THUNDERX2
case 0x0af: // ThunderX2
case 0x0b8: // ThunderX3
return BLIS_ARCH_THUNDERX2;
#endif
}
break;
case 0x46: // Fujitsu
switch (part)
{
#ifdef BLIS_CONFIG_A64FX
case 0x001: // A64FX
return BLIS_ARCH_A64FX;
#endif
}
break;
case 0x61: // Apple
switch (part)
{
#ifdef BLIS_CONFIG_FIRESTORM
case 0x022: // Icestorm (M1.LITTLE)
case 0x023: // Firestorm (M1.big)
return BLIS_ARCH_FIRESTORM;
#endif
}
break;
}
#ifdef BLIS_CONFIG_ARMSVE
if (has_sve)
return BLIS_ARCH_ARMSVE;
#endif
// Can't use #if defined(...) here because of parsing done for autoconfiguration
#ifdef BLIS_CONFIG_CORTEXA57
return BLIS_ARCH_CORTEXA57;
#else
#ifdef BLIS_CONFIG_CORTEXA53
return BLIS_ARCH_CORTEXA53;
#else
return BLIS_ARCH_GENERIC;
#endif
#endif
}
uint32_t bli_cpuid_query
     (
       uint32_t* model,
       uint32_t* part,
       uint32_t* features
     )
{
    // get_coretype() fills in *features itself and returns the value that
    // is reported through 'part'.
    const uint32_t coretype = get_coretype( features );

    *part  = coretype;

    // This variant always reports ARMv8 and the ARM vendor.
    *model = MODEL_ARMV8;

    return VENDOR_ARM;
}
#elif defined(__arm__) || defined(_M_ARM)
/*
I can't easily find documentation to do this as for aarch64, though
it presumably could be unearthed from Linux code. However, on
Linux 5.2 (and Android's 3.4), /proc/cpuinfo has this sort of
thing, used below:
CPU implementer : 0x41
CPU architecture: 7
CPU variant : 0x3
CPU part : 0xc09
The complication for family selection is that Neon is optional for
CortexA9, for instance. That's tested in bli_cpuid_is_cortexa9.
*/
#define TEMP_BUFFER_SIZE 200
uint32_t bli_cpuid_query
     (
       uint32_t* model,
       uint32_t* part,
       uint32_t* features
     )
{
    char* cpuinfo_path = "/proc/cpuinfo";

    // One buffer per /proc/cpuinfo line of interest.
    char  processor_line[ TEMP_BUFFER_SIZE ];
    char  part_line[ TEMP_BUFFER_SIZE ];
    char  features_line[ TEMP_BUFFER_SIZE ];

    // Defaults that are reported if any of the lookups below fails.
    *model    = MODEL_UNKNOWN;
    *part     = 0;
    *features = 0;

    // Look up the 'Processor', 'CPU part', and 'Features' entries, in that
    // order, bailing out with the defaults as soon as one is missing.
    if ( find_string_in( "Processor", processor_line, TEMP_BUFFER_SIZE, cpuinfo_path ) == NULL )
        return VENDOR_ARM;

    if ( find_string_in( "CPU part", part_line, TEMP_BUFFER_SIZE, cpuinfo_path ) == NULL )
        return VENDOR_ARM;

    if ( find_string_in( "Features", features_line, TEMP_BUFFER_SIZE, cpuinfo_path ) == NULL )
        return VENDOR_ARM;

    // SIMD support is advertised as either "neon" or "asimd".
    const bool has_neon = strstr( features_line, "neon"  ) != NULL ||
                          strstr( features_line, "asimd" ) != NULL;

    if ( has_neon ) *features |= FEATURE_NEON;

    // SVE support is advertised as "sve".
    if ( strstr( features_line, "sve" ) != NULL ) *features |= FEATURE_SVE;

    // The processor line determines the model; *model stays MODEL_UNKNOWN
    // if neither pattern matches.
    if ( strstr( processor_line, "ARMv7" ) != NULL )
    {
        *model = MODEL_ARMV7;
    }
    else if ( strstr( processor_line, "AArch64" ) != NULL ||
              strstr( processor_line, "ARMv8"   ) )
    {
        *model = MODEL_ARMV8;
    }

    // The part number is the hexadecimal value that follows "0x" on the
    // 'CPU part' line; *part stays 0 if there is none.
    const char* hex_start = strstr( part_line, "0x" );

    if ( hex_start != NULL ) *part = strtol( hex_start, NULL, 16 );

    return VENDOR_ARM;
}
// Search the file located at 'filepath' for the first line that contains
// the string 'target', and copy that line into 'buffer'.
//
// - target:   the string to look for.
// - buffer:   receives the matching line (the whole chunk read by fgets(),
//             not just the part starting at 'target'); always
//             NUL-terminated on success, left untouched on failure.
// - buf_len:  size of 'buffer' in bytes. Lines are read in chunks of at
//             most buf_len-2 characters, so longer lines are examined (and
//             copied) piecewise.
// - filepath: path of the file to search.
//
// Returns a pointer to the first occurrence of 'target' within 'buffer', or
// NULL if no line matched, the file could not be opened, memory could not be
// allocated, or 'buf_len' is too small to read anything.
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath )
{
    char* match = NULL;

    // fgets() is asked for buf_len-1 bytes below, which includes the
    // terminator. With fewer than three bytes it could never consume a
    // character, and the read loop would not make progress.
    if ( buf_len < 3 ) return NULL;

    // Allocate a temporary local buffer equal to the size of buffer.
    char* line = malloc( buf_len * sizeof( char ) );
    if ( line == NULL ) return NULL;

    // Open the file stream. The file may legitimately be absent (e.g. no
    // /proc filesystem), which is reported to the caller as "not found".
    FILE* stream = fopen( filepath, "r" );
    if ( stream == NULL )
    {
        free( line );
        return NULL;
    }

    // Repeatedly read in a line from the stream, up to buf_len-1 bytes.
    // fgets() returns NULL at end-of-file or on error, which ends the search.
    while ( fgets( line, ( int )( buf_len - 1 ), stream ) != NULL )
    {
        char* hit = strstr( line, target );

        if ( hit != NULL )
        {
            // Copy the string read by fgets() to the caller's buffer and
            // make sure that it ends with a terminating null character.
            strncpy( buffer, line, buf_len );
            buffer[ buf_len - 1 ] = '\0';

            // Report the match relative to the caller's buffer; 'line' is
            // about to be freed, so a pointer into it must not escape.
            match = buffer + ( hit - line );

            break;
        }
    }

    // Close the file stream.
    fclose( stream );

    // Free the temporary local buffer.
    free( line );

    // Return the match so the caller knows if we failed.
    return match;
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_cpuid.h 0000664 0000000 0000000 00000015323 14634250137 0021742 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if 0
// Used only during standalone testing of ARM support.
#define FALSE 0
#define TRUE 1
typedef enum
{
BLIS_ARCH_CORTEXA57 = 10,
BLIS_ARCH_CORTEXA15 = 11,
BLIS_ARCH_CORTEXA9 = 12,
BLIS_ARCH_GENERIC = 13
} arch_t;
typedef uint64_t bool;
#define bli_abort abort
#endif
#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H
// Returns the arch_t id of the sub-configuration selected for the hardware,
// based on the values reported by bli_cpuid_query().
arch_t bli_cpuid_query_id( void );
// Each bli_cpuid_is_*() predicate below takes values reported by
// bli_cpuid_query() and returns whether they match the named
// microarchitecture.
// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );
// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );
// ARM
// NOTE: On ARM the three values are ( model, part, features ) rather than
// ( family, model, features ).
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
// Fills in the three output arguments and returns one of the VENDOR_*
// values. The ARM implementations name the first two arguments 'model' and
// 'part'.
uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );
// -----------------------------------------------------------------------------
//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//
/*
Copyright (C) 2017, The University of Texas at Austin
Copyright (C) 2017, Devin Matthews
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Returns whether every bit that is set in 'want' is also set in 'have'.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
    // Bits that are wanted but not present.
    const uint32_t missing = want & ~have;

    return missing == 0;
}
// -----------------------------------------------------------------------------
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"
// Declarations only; the definitions are not part of this header.
void get_cpu_name( char *cpu_name );
int vpu_count( void );
// CPU vendor identifiers for the x86 branch of this header.
enum
{
VENDOR_INTEL = 0,
VENDOR_AMD,
VENDOR_UNKNOWN
};
// Instruction-set feature flags. Each enumerator is a distinct single bit,
// so several can be OR'd into one mask and tested as a group (for example
// with bli_cpuid_has_features(), which checks that all wanted bits are set).
enum
{
FEATURE_SSE3 = 0x0001,
FEATURE_SSSE3 = 0x0002,
FEATURE_SSE41 = 0x0004,
FEATURE_SSE42 = 0x0008,
FEATURE_AVX = 0x0010,
FEATURE_AVX2 = 0x0020,
FEATURE_FMA3 = 0x0040,
FEATURE_FMA4 = 0x0080,
FEATURE_AVX512F = 0x0100,
FEATURE_AVX512DQ = 0x0200,
FEATURE_AVX512PF = 0x0400,
FEATURE_AVX512ER = 0x0800,
FEATURE_AVX512CD = 0x1000,
FEATURE_AVX512BW = 0x2000,
FEATURE_AVX512VL = 0x4000
};
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably searches the file at `filepath` for `target`
// using `buffer` (of `buf_len` bytes) as scratch space -- confirm against the
// implementation.
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );
// CPU vendor identifiers for the ARM branch of this header.
enum
{
VENDOR_ARM = 0,
VENDOR_UNKNOWN
};
// ARM architecture model identifiers.
enum
{
MODEL_ARMV7 = 0,
MODEL_ARMV8,
MODEL_UNKNOWN
};
// Feature flags. Each enumerator is a distinct single bit so that several
// can be OR'd into one mask.
enum
{
FEATURE_NEON = 0x01,
FEATURE_SVE = 0x02
};
#endif
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_env.c 0000664 0000000 0000000 00000010613 14634250137 0021416 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef BLIS_CONFIGURETIME_CPUID
// NOTE: If you need to make any changes to this cpp branch, it's probably
// the case that you also need to modify bli_arch.c, bli_cpuid.c, and
// bli_env.c. Don't forget to update these other files as needed!
// The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp
// branch in bli_system.h is processed. (This macro is normally defined in
// bli_config.h.)
#define BLIS_ENABLE_SYSTEM
// Use C-style static inline functions for any static inline functions that
// happen to be defined by the headers below. (This macro is normally defined
// in bli_config_macro_defs.h.)
#define BLIS_INLINE static
// Since we're not building a shared library, we can forgo the use of the
// BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro
// is normally defined in bli_config_macro_defs.h.)
#define BLIS_EXPORT_BLIS
#include "bli_system.h"
#include "bli_type_defs.h"
//#include "bli_arch.h"
//#include "bli_cpuid.h"
#include "bli_env.h"
#else
#include "blis.h"
#endif
// -----------------------------------------------------------------------------
// Read the environment variable named `env` and return its value parsed as a
// base-10 integer. If the variable is not set, return `fallback` instead.
gint_t bli_env_get_var( const char* env, gint_t fallback )
{
	// getenv() yields NULL when the variable does not exist.
	const char* const value = getenv( env );

	if ( value == NULL ) return fallback;

	// The variable exists; convert its text to an integer. No end pointer
	// is requested, so the result is whatever strtol() produces for the text.
	return ( gint_t )strtol( value, NULL, 10 );
}
#if 0
#ifdef _MSC_VER
#define strerror_r(errno,buf,len) strerror_s(buf,len,errno)
#endif
// Format the integer `value` as text and store it in the environment variable
// named `env`, overwriting any existing value. On failure, print the errno
// message via bli_print_msg().
// NOTE: this function is compiled out by the enclosing `#if 0`.
// NOTE(review): r_val is declared dim_t yet compared against -1, and the
// "%u"/"%lu" conversions are applied to a dim_t argument -- confirm the types
// line up before re-enabling this code.
void bli_env_set_var( const char* env, dim_t value )
{
dim_t r_val;
char value_str[32];
const char* fs_32 = "%u";
const char* fs_64 = "%lu";
// Convert the integer to a string, but vary the format specifier
// depending on the integer type size.
if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value );
else sprintf( value_str, fs_64, value );
// Set the environment variable using the string we just wrote to via
// sprintf(). (The 'TRUE' argument means we want to overwrite the current
// value if the environment variable already exists.)
r_val = bli_setenv( env, value_str, TRUE );
// Check the return value in case something went horribly wrong.
if ( r_val == -1 )
{
char err_str[128];
// Query the human-readable error string corresponding to errno.
strerror_r( errno, err_str, 128 );
// Print the error message.
bli_print_msg( err_str, __FILE__, __LINE__ );
}
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_env.h 0000664 0000000 0000000 00000003643 14634250137 0021430 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_ENV_H
#define BLIS_ENV_H
// Return the integer value of environment variable `env` (parsed base-10 with
// strtol()), or `fallback` when the variable is not set.
gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_error.c 0000664 0000000 0000000 00000020677 14634250137 0021772 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Internal table of human-readable error messages. Each entry is stored at
// the index obtained by negating its error code (see the designated
// initializers below and bli_error_string_for_code()), and the table holds
// -BLIS_ERROR_CODE_MAX slots. Slots without an initializer are NULL.
static char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
{
[-BLIS_INVALID_ERROR_CHECKING_LEVEL] = "Invalid error checking level.",
[-BLIS_UNDEFINED_ERROR_CODE] = "Undefined error code.",
[-BLIS_NULL_POINTER] = "Encountered unexpected null pointer.",
[-BLIS_NOT_YET_IMPLEMENTED] = "Requested functionality not yet implemented.",
[-BLIS_INVALID_SIDE] = "Invalid side parameter value.",
[-BLIS_INVALID_UPLO] = "Invalid uplo_t parameter value.",
[-BLIS_INVALID_TRANS] = "Invalid trans_t parameter value.",
[-BLIS_INVALID_CONJ] = "Invalid conj_t parameter value.",
[-BLIS_INVALID_DIAG] = "Invalid diag_t parameter value.",
[-BLIS_EXPECTED_NONUNIT_DIAG] = "Expected object with non-unit diagonal.",
[-BLIS_INVALID_DATATYPE] = "Invalid datatype value.",
[-BLIS_EXPECTED_FLOATING_POINT_DATATYPE] = "Expected floating-point datatype value.",
[-BLIS_EXPECTED_NONINTEGER_DATATYPE] = "Expected non-integer datatype value.",
[-BLIS_EXPECTED_NONCONSTANT_DATATYPE] = "Expected non-constant datatype value.",
[-BLIS_EXPECTED_REAL_DATATYPE] = "Expected real datatype value.",
[-BLIS_EXPECTED_INTEGER_DATATYPE] = "Expected integer datatype value.",
[-BLIS_INCONSISTENT_DATATYPES] = "Expected consistent datatypes (equal, or one being constant).",
[-BLIS_EXPECTED_REAL_PROJ_OF] = "Expected second datatype to be real projection of first.",
[-BLIS_EXPECTED_REAL_VALUED_OBJECT] = "Expected real-valued object (ie: if complex, imaginary component equals zero).",
[-BLIS_INCONSISTENT_PRECISIONS] = "Expected consistent precisions (both single or both double).",
[-BLIS_NONCONFORMAL_DIMENSIONS] = "Encountered non-conformal dimensions between objects.",
[-BLIS_EXPECTED_SCALAR_OBJECT] = "Expected scalar object.",
[-BLIS_EXPECTED_VECTOR_OBJECT] = "Expected vector object.",
[-BLIS_UNEQUAL_VECTOR_LENGTHS] = "Encountered unequal vector lengths.",
[-BLIS_EXPECTED_SQUARE_OBJECT] = "Expected square object.",
[-BLIS_UNEXPECTED_OBJECT_LENGTH] = "Unexpected object length.",
[-BLIS_UNEXPECTED_OBJECT_WIDTH] = "Unexpected object width.",
[-BLIS_UNEXPECTED_VECTOR_DIM] = "Unexpected vector dimension.",
[-BLIS_UNEXPECTED_DIAG_OFFSET] = "Unexpected object diagonal offset.",
[-BLIS_NEGATIVE_DIMENSION] = "Encountered negative dimension.",
[-BLIS_INVALID_ROW_STRIDE] = "Encountered invalid row stride relative to n dimension.",
[-BLIS_INVALID_COL_STRIDE] = "Encountered invalid col stride relative to m dimension.",
[-BLIS_INVALID_DIM_STRIDE_COMBINATION] = "Encountered invalid stride/dimension combination.",
[-BLIS_EXPECTED_GENERAL_OBJECT] = "Expected general object.",
[-BLIS_EXPECTED_HERMITIAN_OBJECT] = "Expected Hermitian object.",
[-BLIS_EXPECTED_SYMMETRIC_OBJECT] = "Expected symmetric object.",
[-BLIS_EXPECTED_TRIANGULAR_OBJECT] = "Expected triangular object.",
[-BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT] = "Expected upper or lower triangular object.",
[-BLIS_INVALID_3x1_SUBPART] = "Encountered invalid 3x1 (vertical) subpartition label.",
[-BLIS_INVALID_1x3_SUBPART] = "Encountered invalid 1x3 (horizontal) subpartition label.",
[-BLIS_INVALID_3x3_SUBPART] = "Encountered invalid 3x3 (diagonal) subpartition label.",
[-BLIS_UNEXPECTED_NULL_CONTROL_TREE] = "Encountered unexpected null control tree node.",
[-BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK] = "Pack schema not yet supported/implemented for use with unpacking.",
[-BLIS_EXPECTED_NONNULL_OBJECT_BUFFER] = "Encountered object with non-zero dimensions containing null buffer.",
[-BLIS_MALLOC_RETURNED_NULL] = "malloc() returned NULL; heap memory is likely exhausted.",
[-BLIS_INVALID_PACKBUF] = "Invalid packbuf_t value.",
[-BLIS_EXHAUSTED_CONTIG_MEMORY_POOL] = "Attempted to allocate more memory from contiguous pool than is available.",
[-BLIS_INSUFFICIENT_STACK_BUF_SIZE] = "Configured maximum stack buffer size is insufficient for register blocksizes currently in use.",
[-BLIS_ALIGNMENT_NOT_POWER_OF_TWO] = "Encountered memory alignment value that is either zero or not a power of two.",
[-BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE] = "Encountered memory alignment value that is not a multiple of sizeof(void*).",
[-BLIS_EXPECTED_OBJECT_ALIAS] = "Expected object to be alias.",
[-BLIS_INVALID_ARCH_ID] = "Invalid architecture id value.",
[-BLIS_UNINITIALIZED_GKS_CNTX] = "Accessed uninitialized context in gks; BLIS_ARCH_TYPE is probably set to an invalid architecture id.",
[-BLIS_MC_DEF_NONMULTIPLE_OF_MR] = "Default MC is non-multiple of MR for one or more datatypes.",
[-BLIS_MC_MAX_NONMULTIPLE_OF_MR] = "Maximum MC is non-multiple of MR for one or more datatypes.",
[-BLIS_NC_DEF_NONMULTIPLE_OF_NR] = "Default NC is non-multiple of NR for one or more datatypes.",
[-BLIS_NC_MAX_NONMULTIPLE_OF_NR] = "Maximum NC is non-multiple of NR for one or more datatypes.",
[-BLIS_KC_DEF_NONMULTIPLE_OF_KR] = "Default KC is non-multiple of KR for one or more datatypes.",
[-BLIS_KC_MAX_NONMULTIPLE_OF_KR] = "Maximum KC is non-multiple of KR for one or more datatypes.",
};
// -----------------------------------------------------------------------------
// Write a diagnostic to stderr: a blank line, a "libblis:" line naming the
// source file and line number, and a "libblis:" line carrying the message.
// The stream is flushed before returning.
void bli_print_msg( char* str, char* file, guint_t line )
{
	FILE* const out = stderr;

	// The line number is printed with %lu, so widen it explicitly first.
	const long unsigned int line_lu = ( long unsigned int )line;

	fprintf( out, "\n" );
	fprintf( out, "libblis: %s (line %lu):\n", file, line_lu );
	fprintf( out, "libblis: %s\n", str );

	fflush( out );
}
// Announce on stderr that the library is aborting, then terminate the
// process via abort().
void bli_abort( void )
{
	fputs( "libblis: Aborting.\n", stderr );

	abort();
}
// -----------------------------------------------------------------------------
// Current error checking level. It starts out as BLIS_FULL_ERROR_CHECKING and
// is changed only through bli_error_checking_level_set().
// NOTE(review): BLIS_THREAD_LOCAL presumably expands to a thread-local
// storage specifier, which would make this level per-thread -- confirm against
// the macro's definition.
static BLIS_THREAD_LOCAL errlev_t bli_err_chk_level = BLIS_FULL_ERROR_CHECKING;
// Return the current error checking level.
errlev_t bli_error_checking_level( void )
{
return bli_err_chk_level;
}
// Validate `new_level` and install it as the current error checking level.
void bli_error_checking_level_set( errlev_t new_level )
{
	// Run the requested level through the validity check and hand the
	// resulting error code to the standard error-code handler before the
	// level is stored.
	const err_t status = bli_check_valid_error_level( new_level );

	bli_check_error_code( status );

	bli_err_chk_level = new_level;
}
bool bli_error_checking_is_enabled( void )
{
return bli_error_checking_level() != BLIS_NO_ERROR_CHECKING;
}
// Return the human-readable message for error code `code`.
//
// Messages live in bli_error_string[] at index -code, and the table has
// -BLIS_ERROR_CODE_MAX slots. A code that would index outside the table, or
// that maps to a slot with no message (NULL), yields the "Undefined error
// code." message instead, so the result is always a printable string.
char* bli_error_string_for_code( gint_t code )
{
	char* msg = NULL;

	// Only codes in ( BLIS_ERROR_CODE_MAX, 0 ] map to a valid table index.
	// Compare before negating so that an arbitrary `code` can neither index
	// out of bounds nor overflow on negation.
	if ( BLIS_ERROR_CODE_MAX < code && code <= 0 )
		msg = bli_error_string[ -code ];

	// Unpopulated slots are NULL; fall back to the generic message rather
	// than returning a pointer that callers would pass to a "%s" conversion.
	if ( msg == NULL )
		msg = bli_error_string[ -BLIS_UNDEFINED_ERROR_CODE ];

	return msg;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_error.h 0000664 0000000 0000000 00000004116 14634250137 0021765 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Return the current error checking level.
BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void );
// Validate new_level and make it the current error checking level.
BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level );
// Return true unless the current level is BLIS_NO_ERROR_CHECKING.
BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void );
// Print str to stderr, prefixed with "libblis:" and the given file and line.
void bli_print_msg( char* str, char* file, guint_t line );
// Print "libblis: Aborting." to stderr and call abort().
BLIS_EXPORT_BLIS void bli_abort( void );
// Look up the message string associated with an error code.
char* bli_error_string_for_code( gint_t code );
cython-blis-1.0.0/blis/_src/frame/base/bli_func.c 0000664 0000000 0000000 00000006142 14634250137 0021563 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Allocate a func_t with bli_malloc_intl() and initialize its four
// per-datatype function pointers (float, double, scomplex, dcomplex) from
// ptr_s, ptr_d, ptr_c and ptr_z respectively.
//
// Returns the new func_t, or NULL if the allocation produced no memory. The
// result is meant to be released with bli_func_free().
func_t* bli_func_create
     (
       void_fp ptr_s,
       void_fp ptr_d,
       void_fp ptr_c,
       void_fp ptr_z
     )
{
	func_t* f;
	err_t   r_val;

	f = ( func_t* )bli_malloc_intl( sizeof( func_t ), &r_val );

	// Never write through a failed allocation: bli_func_init() stores into
	// f unconditionally, so bail out here if no memory was obtained.
	if ( f == NULL ) return NULL;

	bli_func_init
	(
	  f,
	  ptr_s,
	  ptr_d,
	  ptr_c,
	  ptr_z
	);

	return f;
}
// Fill an existing func_t with one function pointer per floating-point
// datatype: ptr_s for BLIS_FLOAT, ptr_d for BLIS_DOUBLE, ptr_c for
// BLIS_SCOMPLEX and ptr_z for BLIS_DCOMPLEX.
void bli_func_init
     (
       func_t* f,
       void_fp ptr_s,
       void_fp ptr_d,
       void_fp ptr_c,
       void_fp ptr_z
     )
{
	// Store directly into the per-datatype slots; this is exactly what
	// bli_func_set_dt() does for each ( pointer, datatype ) pair.
	f->ptr[ BLIS_FLOAT    ] = ptr_s;
	f->ptr[ BLIS_DOUBLE   ] = ptr_d;
	f->ptr[ BLIS_SCOMPLEX ] = ptr_c;
	f->ptr[ BLIS_DCOMPLEX ] = ptr_z;
}
// Reset all four per-datatype function pointers of `f` to NULL.
void bli_func_init_null
     (
       func_t* f
     )
{
	// The datatypes whose slots are cleared, in the order they are visited.
	static const num_t dts[] =
	{
		BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX, BLIS_DCOMPLEX
	};
	size_t i;

	for ( i = 0; i < sizeof( dts ) / sizeof( dts[ 0 ] ); ++i )
		bli_func_set_dt( NULL, dts[ i ], f );
}
// Release a func_t by passing it to bli_free_intl(). This is the counterpart
// of bli_func_create(), which obtains its memory from bli_malloc_intl().
void bli_func_free( func_t* f )
{
bli_free_intl( f );
}
// -----------------------------------------------------------------------------
// Return true when the function pointer stored in `f` for datatype `dt` is
// NULL.
bool bli_func_is_null_dt( num_t dt,
                          func_t* f )
{
	const void_fp fp = bli_func_get_dt( dt, f );

	return fp == NULL;
}
// Return TRUE when every function pointer stored in `f` for the datatypes
// BLIS_DT_LO through BLIS_DT_HI is NULL, and FALSE as soon as one is not.
bool bli_func_is_null( func_t* f )
{
	num_t dt;

	for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
	{
		// A single populated slot is enough to decide the answer.
		if ( bli_func_get_dt( dt, f ) != NULL ) return FALSE;
	}

	// No populated slot was found.
	return TRUE;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_func.h 0000664 0000000 0000000 00000005542 14634250137 0021573 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -----------------------------------------------------------------------------
// func_t query
// Return the function pointer stored in `func` for datatype `dt`. The
// datatype is used directly as the index into func->ptr and is not
// range-checked here.
BLIS_INLINE void_fp bli_func_get_dt
(
num_t dt,
func_t* func
)
{
return func->ptr[ dt ];
}
// func_t modification
// Store function pointer `fp` in `func` for datatype `dt`. The datatype is
// used directly as the index into func->ptr and is not range-checked here.
BLIS_INLINE void bli_func_set_dt
(
void_fp fp,
num_t dt,
func_t* func
)
{
func->ptr[ dt ] = fp;
}
// Copy the function pointer held by `func_src` for datatype `dt_src` into
// the slot of `func_dst` for datatype `dt_dst`.
BLIS_INLINE void bli_func_copy_dt
     (
       num_t dt_src, func_t* func_src,
       num_t dt_dst, func_t* func_dst
     )
{
	// Equivalent to a bli_func_get_dt() on the source followed by a
	// bli_func_set_dt() on the destination.
	func_dst->ptr[ dt_dst ] = func_src->ptr[ dt_src ];
}
// -----------------------------------------------------------------------------
// Allocate a func_t via bli_malloc_intl() and initialize it with the four
// per-datatype function pointers.
func_t* bli_func_create
(
void_fp ptr_s,
void_fp ptr_d,
void_fp ptr_c,
void_fp ptr_z
);
// Initialize an existing func_t: ptr_s, ptr_d, ptr_c and ptr_z are stored
// for BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX and BLIS_DCOMPLEX respectively.
void bli_func_init
(
func_t* f,
void_fp ptr_s,
void_fp ptr_d,
void_fp ptr_c,
void_fp ptr_z
);
// Set all four per-datatype function pointers of an existing func_t to NULL.
void bli_func_init_null
(
func_t* f
);
// Release a func_t via bli_free_intl().
void bli_func_free( func_t* f );
// -----------------------------------------------------------------------------
// Return true when the function pointer stored for datatype dt is NULL.
bool bli_func_is_null_dt( num_t dt,
func_t* f );
// Return true when the function pointers for all datatypes from BLIS_DT_LO
// through BLIS_DT_HI are NULL.
bool bli_func_is_null( func_t* f );
cython-blis-1.0.0/blis/_src/frame/base/bli_getopt.c 0000664 0000000 0000000 00000014114 14634250137 0022130 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The character that must begin an argv element for bli_getopt() to treat it
// as an option.
static const char OPT_MARKER = '-';
// Prepare `state` for a fresh pass over an argument vector: no current option
// argument, parsing starts at argv[1], the caller's `opterr` setting is
// recorded, and no offending option character is stored yet.
void bli_getopt_init_state( int opterr, getopt_t* state )
{
	// getopt_t consists of exactly these four fields, so assigning a
	// compound literal sets the whole structure in one statement.
	*state = ( getopt_t )
	{
		.optarg = NULL,
		.optind = 1,
		.opterr = opterr,
		.optopt = 0
	};
}
// Parse the next option character from argv, in the manner of getopt().
//
// Parameters:
//   argc, argv - the argument vector to scan; scanning resumes at
//                argv[ state->optind ].
//   optstring  - the accepted option characters; a character followed by ':'
//                takes an argument.
//   state      - parser state, initialized with bli_getopt_init_state().
//
// Returns:
//   - the option character, when a recognized option was parsed. If the
//     option takes an argument, state->optarg points to it.
//   - '?' when the option character is not in optstring, or when a required
//     argument is missing. The offending character is stored in
//     state->optopt, and a message is printed to stderr if state->opterr
//     is 1.
//   - -1 when there is nothing left to parse: argv is exhausted, or the
//     current element is not an option (it does not start with OPT_MARKER,
//     or it is a lone "-").
//
// All option arguments must precede all non-option arguments; argv is never
// permuted.
//
// NOTE: the position within a bundle of options (e.g. "-rv") is kept in a
// function-level static variable rather than in `state`, so this function is
// not reentrant and must not be used to parse two argument vectors in an
// interleaved fashion.
int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state )
{
	static char* nextchar = NULL;

	char* elem_str;
	char* optstr_char;

	// If argv contains no more arguments to process, return. (The comparison
	// is >= rather than == so that an optind that has somehow moved beyond
	// argc can never be used to index past the end of argv.)
	if ( state->optind >= argc ) return -1;

	// Get a pointer to the current argv element string to process. If
	// nextchar is non-NULL, then it means the previous call processed
	// an element of argv with more than one option character, in which
	// case we need to pick up where we left off (which is the address
	// contained in nextchar).
	if ( nextchar == NULL )
	{
		elem_str = argv[ state->optind ];

		// elem_str[0] should be an OPT_MARKER if it is an option. In the
		// event that it is not an option, argv should be permuted so that
		// the non-option argument moves back toward the end of the list.
		// This functionality is not supported/implemented here. Therefore,
		// we require all of the program's option arguments to precede all of
		// its non-option arguments.
		//
		// A lone OPT_MARKER ("-") carries no option character and is also
		// treated as a non-option argument. Without this check, the string
		// terminator would be looked up in optstring below; strchr() matches
		// the terminator of optstring itself, and the code would then read
		// one byte past the end of both optstring and the argv element.
		if ( elem_str == NULL ||
		     elem_str[0] != OPT_MARKER ||
		     elem_str[1] == '\0' )
		{
			state->optarg = NULL;
			//state->optind += 1;

			return -1;
		}

		// Skip over the OPT_MARKER.
		elem_str++;
	}
	else
	{
		// Note we don't need to skip the OPT_MARKER here since we are
		// continuing processing of a string with more than one option
		// character.

		// Use the nextchar pointer as our element string.
		elem_str = nextchar;

		// Reset nextchar to NULL.
		nextchar = NULL;
	}

	// Find the first occurrence of elem_str[0] in optstring.
	optstr_char = strchr( optstring, elem_str[0] );

	// If the option character in elem_str[0] is absent from the option
	// string, store it and return '?'.
	if ( optstr_char == NULL )
	{
		if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' missing from option string \"%s\"\n", elem_str[0], optstring );

		// We can't dereference optstr_char since it is NULL, so we use
		// elem_str[0] instead.
		state->optopt = elem_str[0];
		state->optind += 1;

		return '?';
	}

	// We can now safely assume that an option character was found in the
	// option string. Now we need to check if the option takes an argument.
	if ( optstr_char[1] == ':' )
	{
		// If the current element string ends after the option character,
		// then the companion argument must be stored in the next element
		// of argv. Otherwise, the argument begins immediately after the
		// option character.
		if ( elem_str[1] == '\0' )
		{
			// If there are no more elements in argv, the argument was
			// omitted. Store the corresponding option character and
			// return '?'.
			if ( state->optind + 1 >= argc )
			{
				if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (end of argv)\n", elem_str[0] );

				state->optopt = *optstr_char;
				state->optind += 1;

				return '?';
			}
			// If there are still more elements in argv yet to process AND
			// the next one is an option, then the argument was omitted.
			else if ( argv[ state->optind + 1 ][0] == OPT_MARKER )
			{
				if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] );

				state->optopt = *optstr_char;
				state->optind += 1;

				return '?';
			}

			// If no error was detected above, we can safely assign optarg
			// to be the next element in argv and increment optind by two.
			state->optarg = argv[ state->optind + 1 ];
			state->optind += 2;
		}
		else
		{
			// We don't need to check for missing arguments since we know
			// that because the char after the option character is not the
			// string terminator, the character(s) after it must constitute
			// the argument.
			state->optarg = &elem_str[1];
			state->optind += 1;
		}

		return *optstr_char;
	}

	// The current option character does NOT take an argument. However, we
	// still need to check if the next char is another option character (such
	// as occurs when the user runs "program -rv" instead of "program -r -v").
	if ( elem_str[1] != '\0' )
	{
		if ( strchr( optstring, elem_str[1] ) != NULL )
		{
			// Remember where to resume; optind is deliberately left
			// unchanged so that the next call continues within this same
			// argv element.
			nextchar = &elem_str[1];

			return *optstr_char;
		}
	}

	state->optarg = NULL;
	state->optind += 1;

	return *optstr_char;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_getopt.h 0000664 0000000 0000000 00000003642 14634250137 0022141 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parser state for bli_getopt(). Initialize with bli_getopt_init_state()
// before the first call.
typedef struct getopt_s
{
// Argument of the most recently parsed option, or NULL.
char* optarg;
// Index of the argv element to be examined next; initialized to 1.
int optind;
// When equal to 1, bli_getopt() prints error messages to stderr.
int opterr;
// Option character that caused the most recent '?' return; initialized to 0.
int optopt;
} getopt_t;
// Reset `state` so that parsing begins at argv[1], recording `opterr` as the
// error-reporting setting.
BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );
// Parse the next option from argv according to optstring. Returns the option
// character, '?' on an unrecognized option or missing argument, or -1 when
// there are no more options to process.
BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );
cython-blis-1.0.0/blis/_src/frame/base/bli_gks.c 0000664 0000000 0000000 00000067367 14634250137 0021434 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018-2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The global kernel structure (gks): one entry per architecture id. Each
// non-NULL entry points to an array of BLIS_NUM_IND_METHODS cntx_t* elements,
// indexed by ind_t. The BLIS_NAT element holds the native context (allocated
// at registration); the other elements cache contexts for induced methods and
// stay NULL until bli_gks_query_ind_cntx() first creates them.
static cntx_t** gks[ BLIS_NUM_ARCHS ];
// The array of function pointers holding the registered context initialization
// functions for induced methods, indexed by architecture id.
static void_fp cntx_ind_init[ BLIS_NUM_ARCHS ];
// The array of function pointers holding the registered context initialization
// functions for reference kernels, indexed by architecture id.
static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ];
// Function pointer types for the three kinds of context initialization
// functions. The generic void_fp values passed to bli_gks_register_cntx() and
// stored in the arrays above are converted to these types before being called.
// Native and reference initializers take only the context to fill in.
typedef void (*nat_cntx_init_ft)( cntx_t* cntx );
typedef void (*ref_cntx_init_ft)( cntx_t* cntx );
// Induced-method initializers additionally take the induced method id.
typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx );
// -----------------------------------------------------------------------------
// Initializes the global kernel structure (gks): clears the internal tables
// and then registers a native context, a reference-context initializer and an
// induced-method-context initializer for every sub-configuration enabled at
// configure-time (one BLIS_CONFIG_* macro per architecture, #define'd in
// bli_config.h).
//
// Each architecture is registered exactly once. (BLIS_ARCH_A64FX previously
// appeared twice in the ARM section; the second call only re-stored the same
// function pointers and returned early, so it has been dropped.)
void bli_gks_init( void )
{
	// Initialize the internal data structure we use to track registered
	// contexts.
	bli_gks_init_index();

	// Register a context for each architecture that was #define'd in
	// bli_config.h.

	// Intel architectures
#ifdef BLIS_CONFIG_SKX
	bli_gks_register_cntx( BLIS_ARCH_SKX,         bli_cntx_init_skx,
	                                              bli_cntx_init_skx_ref,
	                                              bli_cntx_init_skx_ind );
#endif
#ifdef BLIS_CONFIG_KNL
	bli_gks_register_cntx( BLIS_ARCH_KNL,         bli_cntx_init_knl,
	                                              bli_cntx_init_knl_ref,
	                                              bli_cntx_init_knl_ind );
#endif
#ifdef BLIS_CONFIG_KNC
	bli_gks_register_cntx( BLIS_ARCH_KNC,         bli_cntx_init_knc,
	                                              bli_cntx_init_knc_ref,
	                                              bli_cntx_init_knc_ind );
#endif
#ifdef BLIS_CONFIG_HASWELL
	bli_gks_register_cntx( BLIS_ARCH_HASWELL,     bli_cntx_init_haswell,
	                                              bli_cntx_init_haswell_ref,
	                                              bli_cntx_init_haswell_ind );
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
	bli_gks_register_cntx( BLIS_ARCH_SANDYBRIDGE, bli_cntx_init_sandybridge,
	                                              bli_cntx_init_sandybridge_ref,
	                                              bli_cntx_init_sandybridge_ind );
#endif
#ifdef BLIS_CONFIG_PENRYN
	bli_gks_register_cntx( BLIS_ARCH_PENRYN,      bli_cntx_init_penryn,
	                                              bli_cntx_init_penryn_ref,
	                                              bli_cntx_init_penryn_ind );
#endif

	// AMD architectures
#ifdef BLIS_CONFIG_ZEN3
	bli_gks_register_cntx( BLIS_ARCH_ZEN3,        bli_cntx_init_zen3,
	                                              bli_cntx_init_zen3_ref,
	                                              bli_cntx_init_zen3_ind );
#endif
#ifdef BLIS_CONFIG_ZEN2
	bli_gks_register_cntx( BLIS_ARCH_ZEN2,        bli_cntx_init_zen2,
	                                              bli_cntx_init_zen2_ref,
	                                              bli_cntx_init_zen2_ind );
#endif
#ifdef BLIS_CONFIG_ZEN
	bli_gks_register_cntx( BLIS_ARCH_ZEN,         bli_cntx_init_zen,
	                                              bli_cntx_init_zen_ref,
	                                              bli_cntx_init_zen_ind );
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
	bli_gks_register_cntx( BLIS_ARCH_EXCAVATOR,   bli_cntx_init_excavator,
	                                              bli_cntx_init_excavator_ref,
	                                              bli_cntx_init_excavator_ind );
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
	bli_gks_register_cntx( BLIS_ARCH_STEAMROLLER, bli_cntx_init_steamroller,
	                                              bli_cntx_init_steamroller_ref,
	                                              bli_cntx_init_steamroller_ind );
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
	bli_gks_register_cntx( BLIS_ARCH_PILEDRIVER,  bli_cntx_init_piledriver,
	                                              bli_cntx_init_piledriver_ref,
	                                              bli_cntx_init_piledriver_ind );
#endif
#ifdef BLIS_CONFIG_BULLDOZER
	bli_gks_register_cntx( BLIS_ARCH_BULLDOZER,   bli_cntx_init_bulldozer,
	                                              bli_cntx_init_bulldozer_ref,
	                                              bli_cntx_init_bulldozer_ind );
#endif

	// ARM architectures
#ifdef BLIS_CONFIG_A64FX
	bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
	                                              bli_cntx_init_a64fx_ref,
	                                              bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_THUNDERX2
	bli_gks_register_cntx( BLIS_ARCH_THUNDERX2,   bli_cntx_init_thunderx2,
	                                              bli_cntx_init_thunderx2_ref,
	                                              bli_cntx_init_thunderx2_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA57
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA57,   bli_cntx_init_cortexa57,
	                                              bli_cntx_init_cortexa57_ref,
	                                              bli_cntx_init_cortexa57_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA53
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA53,   bli_cntx_init_cortexa53,
	                                              bli_cntx_init_cortexa53_ref,
	                                              bli_cntx_init_cortexa53_ind );
#endif
#ifdef BLIS_CONFIG_ARMSVE
	bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
	                                              bli_cntx_init_armsve_ref,
	                                              bli_cntx_init_armsve_ind );
#endif
#ifdef BLIS_CONFIG_FIRESTORM
	bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
	                                              bli_cntx_init_firestorm_ref,
	                                              bli_cntx_init_firestorm_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA15
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA15,   bli_cntx_init_cortexa15,
	                                              bli_cntx_init_cortexa15_ref,
	                                              bli_cntx_init_cortexa15_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA9
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA9,    bli_cntx_init_cortexa9,
	                                              bli_cntx_init_cortexa9_ref,
	                                              bli_cntx_init_cortexa9_ind );
#endif

	// IBM architectures
#ifdef BLIS_CONFIG_POWER10
	bli_gks_register_cntx( BLIS_ARCH_POWER10,     bli_cntx_init_power10,
	                                              bli_cntx_init_power10_ref,
	                                              bli_cntx_init_power10_ind );
#endif
#ifdef BLIS_CONFIG_POWER9
	bli_gks_register_cntx( BLIS_ARCH_POWER9,      bli_cntx_init_power9,
	                                              bli_cntx_init_power9_ref,
	                                              bli_cntx_init_power9_ind );
#endif
#ifdef BLIS_CONFIG_POWER7
	bli_gks_register_cntx( BLIS_ARCH_POWER7,      bli_cntx_init_power7,
	                                              bli_cntx_init_power7_ref,
	                                              bli_cntx_init_power7_ind );
#endif
#ifdef BLIS_CONFIG_BGQ
	bli_gks_register_cntx( BLIS_ARCH_BGQ,         bli_cntx_init_bgq,
	                                              bli_cntx_init_bgq_ref,
	                                              bli_cntx_init_bgq_ind );
#endif

	// Generic architectures
#ifdef BLIS_CONFIG_GENERIC
	bli_gks_register_cntx( BLIS_ARCH_GENERIC,     bli_cntx_init_generic,
	                                              bli_cntx_init_generic_ref,
	                                              bli_cntx_init_generic_ind );
#endif
}
// -----------------------------------------------------------------------------
// Releases everything the gks owns: every cached context (native and
// induced) of every registered architecture, followed by the per-architecture
// array that held those context pointers.
//
// NOTE: No lock is taken here. The critical section is implicit: this
// function is assumed to be called only from within the critical section
// inside bli_finalize().
void bli_gks_finalize( void )
{
	for ( arch_t arch = 0; arch < BLIS_NUM_ARCHS; ++arch )
	{
		cntx_t** restrict cntx_tbl = gks[ arch ];

		// Architectures that were never registered have nothing to free.
		if ( cntx_tbl == NULL ) continue;

		// Walk the induced-method slots of this architecture and free each
		// context that was actually allocated.
		for ( ind_t im = 0; im < BLIS_NUM_IND_METHODS; ++im )
		{
			cntx_t* restrict cached_cntx = cntx_tbl[ im ];

			if ( cached_cntx == NULL ) continue;

#ifdef BLIS_ENABLE_MEM_TRACING
			printf( "bli_gks_finalize(): cntx for ind_t %d: ", ( int )im );
#endif
			bli_free_intl( cached_cntx );
		}

#ifdef BLIS_ENABLE_MEM_TRACING
		printf( "bli_gks_finalize(): gks for arch_t %d: ", ( int )arch );
#endif
		// Finally, free the array of BLIS_NUM_IND_METHODS cntx_t* elements.
		bli_free_intl( cntx_tbl );
	}
}
// -----------------------------------------------------------------------------
// Called by bli_gks_init(). Resets every architecture slot of the three
// internal tables (gks, cntx_ref_init, cntx_ind_init) to zero/NULL so that
// later code can tell which architectures have been registered.
void bli_gks_init_index( void )
{
	// Size each memset from the array object itself rather than from a
	// hand-written element type. (The previous computation used
	// sizeof( cntx_t* ) for gks, whose elements are actually cntx_t**.)
	// This keeps the sizes correct even if a table's element type changes.
	memset( gks,           0, sizeof( gks ) );
	memset( cntx_ref_init, 0, sizeof( cntx_ref_init ) );
	memset( cntx_ind_init, 0, sizeof( cntx_ind_init ) );
}
// -----------------------------------------------------------------------------
// Returns the address of the native context for architecture id. This is the
// BLIS_NAT case of bli_gks_lookup_ind_cntx(); the architecture is assumed to
// have been registered already.
cntx_t* bli_gks_lookup_nat_cntx
     (
       arch_t id
     )
{
	cntx_t* nat_cntx = bli_gks_lookup_ind_cntx( id, BLIS_NAT );

	return nat_cntx;
}
// -----------------------------------------------------------------------------
// Returns the context stored in the gks for the given architecture id and
// induced method.
//
// The architecture must already be registered: gks[ id ] is dereferenced
// without a NULL check. The returned pointer itself is NULL when the
// requested induced method has not been used yet (its slot is still unset).
cntx_t* bli_gks_lookup_ind_cntx
     (
       arch_t id,
       ind_t  ind
     )
{
	// Sanity check: verify that the arch_t id is valid.
	if ( bli_error_checking_is_enabled() )
	{
		err_t id_status = bli_check_valid_arch_id( id );
		bli_check_error_code( id_status );
	}

	// First select the per-architecture array of context pointers, then the
	// element of that array belonging to the induced method.
	cntx_t** restrict cntx_tbl = gks[ id ];

	return cntx_tbl[ ind ];
}
// -----------------------------------------------------------------------------
// Returns the array of context pointers registered for architecture id, or
// NULL if nothing has been registered for it. Only used for sanity checking,
// i.e. to confirm that the underlying data structures for a given id have
// been initialized. No validation of id is performed here.
cntx_t** bli_gks_lookup_id
     (
       arch_t id
     )
{
	return gks[ id ];
}
// -----------------------------------------------------------------------------
// Registers one architecture with the gks. Called by bli_gks_init() once for
// each architecture that BLIS will support.
//
// Parameters:
// - id:     the architecture id being registered.
// - nat_fp: function that initializes a native context (supplied by the
//           kernel developer).
// - ref_fp: function that initializes a reference context, with function
//           pointers specific to the architecture associated with id.
// - ind_fp: function that initializes a context for use with induced
//           methods, again specific to the architecture.
// The latter two are generated automatically by the framework.
//
// Only the native context is created and stored here. Induced-method
// contexts are not stored until that induced method is first called, and
// reference contexts are never stored at all; for those two we merely
// remember the initialization functions so they can be called later, when
// such a context is needed.
void bli_gks_register_cntx
     (
       arch_t  id,
       void_fp nat_fp,
       void_fp ref_fp,
       void_fp ind_fp
     )
{
	err_t alloc_status;
	err_t check_status;

	// Sanity check: verify that the arch_t id is valid.
	if ( bli_error_checking_is_enabled() )
	{
		check_status = bli_check_valid_arch_id( id );
		bli_check_error_code( check_status );
	}

	// Convert the generic function pointer into a callable native context
	// initializer.
	nat_cntx_init_ft init_native = nat_fp;

	// Record the initializers for reference kernels and for induced method
	// execution. The former is used whenever reference kernels must be
	// obtained; the latter is used later on if the user calls a level-3
	// function with induced execution enabled.
	cntx_ref_init[ id ] = ref_fp;
	cntx_ind_init[ id ] = ind_fp;

	// A non-NULL array pointer means this architecture id has already been
	// registered, and its allocations and context initialization are done.
	// Returning here is purely a safeguard against leaking memory; it should
	// never actually happen, since callers should not register the same
	// architecture id twice.
	if ( gks[ id ] != NULL ) return;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_gks_register_cntx(): " );
#endif

	// The array of cntx_t* for this architecture does not exist yet. Allocate
	// it zero-initialized (all elements NULL) and store the address of the
	// allocated memory in the gks element for the current architecture id.
	cntx_t** restrict cntx_tbl =
	    bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &alloc_status );
	gks[ id ] = cntx_tbl;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_gks_register_cntx(): " );
#endif

	// Allocate a single context and store it in the element reserved for
	// native execution.
	cntx_t* restrict nat_cntx = bli_calloc_intl( sizeof( cntx_t ), &alloc_status );
	cntx_tbl[ BLIS_NAT ] = nat_cntx;

	// Let the kernel developer's initializer populate the native context.
	init_native( nat_cntx );

	// Verify that cache blocksizes are whole multiples of register
	// blocksizes. Specifically, verify that:
	//   - MC is a whole multiple of MR.
	//   - NC is a whole multiple of NR.
	//   - KC is a whole multiple of KR.
	// These constraints are enforced because they make it easier to handle
	// diagonals in the macro-kernel implementations. Additionally, unless
	// BLIS_RELAX_MCNR_NCMR_CONSTRAINTS is defined, we also verify that:
	//   - MC is a whole multiple of NR.
	//   - NC is a whole multiple of MR.
	// These latter constraints are only needed for the trsm right-side case
	// handling that swaps A and B, so that B is the triangular matrix, with
	// NR blocking used to pack A and MR blocking used to pack B, and with the
	// arguments to the gemmtrsm microkernel swapped at the last minute, as
	// the kernel is called.
	blksz_t* restrict blksz_mc = bli_cntx_get_blksz( BLIS_MC, nat_cntx );
	blksz_t* restrict blksz_nc = bli_cntx_get_blksz( BLIS_NC, nat_cntx );
	blksz_t* restrict blksz_kc = bli_cntx_get_blksz( BLIS_KC, nat_cntx );
	blksz_t* restrict blksz_mr = bli_cntx_get_blksz( BLIS_MR, nat_cntx );
	blksz_t* restrict blksz_nr = bli_cntx_get_blksz( BLIS_NR, nat_cntx );
	blksz_t* restrict blksz_kr = bli_cntx_get_blksz( BLIS_KR, nat_cntx );

	check_status = bli_check_valid_mc_mod_mult( blksz_mc, blksz_mr );
	bli_check_error_code( check_status );

	check_status = bli_check_valid_nc_mod_mult( blksz_nc, blksz_nr );
	bli_check_error_code( check_status );

	check_status = bli_check_valid_kc_mod_mult( blksz_kc, blksz_kr );
	bli_check_error_code( check_status );

#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
	check_status = bli_check_valid_mc_mod_mult( blksz_mc, blksz_nr );
	bli_check_error_code( check_status );

	check_status = bli_check_valid_nc_mod_mult( blksz_nc, blksz_mr );
	bli_check_error_code( check_status );
#endif

	// Verify that the register blocksizes in the context are sufficiently
	// large relative to the maximum stack buffer size defined at
	// configure-time.
	check_status = bli_check_sufficient_stack_buf_size( nat_cntx );
	bli_check_error_code( check_status );
}
// -----------------------------------------------------------------------------
// Alias for bli_gks_query_nat_cntx(): returns the native context for the
// current hardware.
cntx_t* bli_gks_query_cntx( void )
{
	cntx_t* nat_cntx = bli_gks_query_nat_cntx();

	return nat_cntx;
}
// Returns the address of the native context for the architecture id that
// bli_arch_query_id() reports for the current hardware. bli_init_once() is
// called first.
cntx_t* bli_gks_query_nat_cntx( void )
{
	bli_init_once();

	// Look up the context registered for the current architecture id.
	const arch_t hw_id = bli_arch_query_id();

	return bli_gks_lookup_nat_cntx( hw_id );
}
// -----------------------------------------------------------------------------
// Same as bli_gks_query_cntx(), except that bli_init_once() is NOT called.
// Returns the native context registered for the architecture id reported by
// bli_arch_query_id().
cntx_t* bli_gks_query_cntx_noinit( void )
{
	const arch_t hw_id = bli_arch_query_id();

	return bli_gks_lookup_nat_cntx( hw_id );
}
// -----------------------------------------------------------------------------
// A mutex that serializes access to the gks whenever it must be updated with
// a new entry, i.e. a newly created context for an ind_t value.
static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
// Returns the address of a context suited for executing a level-3 operation
// via the requested induced method, for the architecture id that
// bli_arch_query_id() reports for the current hardware.
//
// This function is called when a level-3 operation via induced method is
// called, e.g. bli_gemm1m(). The first time a given induced method is
// requested since bli_gks_init(), its context is allocated, seeded with a
// copy of the native context, adjusted by the registered induced-method
// initializer, and cached in the gks. Later requests return the cached
// context. Creation happens under gks_mutex to ensure thread safety and
// deterministic behavior.
//
// Parameters:
// - ind: the induced method; BLIS_NAT returns the native context directly.
// - dt:  not referenced by this implementation.
//
// The architecture is assumed to have been registered with the gks already,
// which guarantees that gks[ id ] and gks[ id ][ BLIS_NAT ] are non-NULL and
// that the latter refers to a context initialized with valid data.
cntx_t* bli_gks_query_ind_cntx
     (
       ind_t ind,
       num_t dt
     )
{
	bli_init_once();

	err_t alloc_status;

	// Query the architecture id.
	const arch_t hw_id = bli_arch_query_id();

	// Sanity check: verify that the arch_t id is valid.
	if ( bli_error_checking_is_enabled() )
	{
		err_t id_status = bli_check_valid_arch_id( hw_id );
		bli_check_error_code( id_status );
	}

	// NOTE: These reads can stay outside the critical section because
	// gks[ id ] and its native context should already have been
	// allocated/initialized at registration time.
	cntx_t** restrict cntx_tbl = gks[ hw_id ];
	cntx_t*  restrict nat_cntx = cntx_tbl[ BLIS_NAT ];

	// A request for the native context needs no further work.
	if ( ind == BLIS_NAT ) return nat_cntx;

	// Acquire the mutex protecting the gks.
	bli_pthread_mutex_lock( &gks_mutex );

	// BEGIN CRITICAL SECTION

	// Read the slot belonging to the requested induced method.
	cntx_t* ind_cntx = cntx_tbl[ ind ];

	// An empty slot means this induced method has not been used yet, so its
	// context must be created now.
	if ( ind_cntx == NULL )
	{
		// Allocate the context and store its address back into the gks.
		ind_cntx = bli_calloc_intl( sizeof( cntx_t ), &alloc_status );
		cntx_tbl[ ind ] = ind_cntx;

		// The induced-method initializer assumes the context is already
		// populated with the values for native execution, so copy the
		// native context over first.
		*ind_cntx = *nat_cntx;

		// Fetch the initializer registered for this architecture and let it
		// overwrite whatever the induced method needs to change.
		ind_cntx_init_ft init_induced = cntx_ind_init[ hw_id ];
		init_induced( ind, ind_cntx );
	}

	// END CRITICAL SECTION

	// Release the mutex protecting the gks.
	bli_pthread_mutex_unlock( &gks_mutex );

	// Return the cached (possibly just created) context.
	return ind_cntx;
}
// -----------------------------------------------------------------------------
// Fills the caller-provided context with reference kernels and related
// values, using the reference-context initializer registered for the
// architecture id reported by bli_arch_query_id().
void bli_gks_init_ref_cntx
     (
       cntx_t* cntx
     )
{
	const arch_t hw_id = bli_arch_query_id();

	// Sanity check: verify that the arch_t id is valid.
	if ( bli_error_checking_is_enabled() )
	{
		err_t id_status = bli_check_valid_arch_id( hw_id );
		bli_check_error_code( id_status );
	}

	// Fetch the registered reference-kernel initializer and run it on the
	// caller's context.
	ref_cntx_init_ft init_reference = cntx_ref_init[ hw_id ];

	init_reference( cntx );
}
// -----------------------------------------------------------------------------
// Reports whether the native level-3 micro-kernel stored in cntx for the
// given datatype and micro-kernel id is the reference implementation.
//
// A temporary reference context is initialized for the current architecture
// (as queried via bli_arch_query_id()), and the micro-kernel function pointer
// from that context is compared against the one from cntx.
bool bli_gks_cntx_l3_nat_ukr_is_ref
     (
       num_t    dt,
       l3ukr_t  ukr_id,
       cntx_t*  cntx
     )
{
	// Build a throwaway context holding only reference kernels.
	cntx_t scratch_ref_cntx;
	bli_gks_init_ref_cntx( &scratch_ref_cntx );

	// Fetch the micro-kernel address for dt from each context.
	void_fp reference_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &scratch_ref_cntx );
	void_fp queried_ukr   = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx );

	// The kernel is "reference" exactly when both addresses coincide.
	return queried_ukr == reference_ukr;
}
//
// -- level-3 micro-kernel implementation strings ------------------------------
//
// Short labels returned by bli_gks_l3_ukr_impl_string(), indexed by a
// kimpl_t value.
// NOTE(review): the entry order must match the numeric order of the kimpl_t
// enumerators (presumably BLIS_REFERENCE_UKERNEL, BLIS_VIRTUAL_UKERNEL,
// BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL) -- confirm against the
// enum definition.
static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] =
{
"refrnce",
"virtual",
"optimzd",
"notappl",
};
// -----------------------------------------------------------------------------
// Returns a short label describing how the level-3 micro-kernel ukr is
// implemented for the given induced method and datatype.
//
// The context for the induced method is queried (and created on first use)
// and the virtual micro-kernel pointer for dt is read from it. A NULL
// pointer yields the "not applicable" label; otherwise the label follows the
// classification made by bli_gks_l3_ukr_impl_type().
char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt )
{
	cntx_t* method_cntx = bli_gks_query_ind_cntx( method, dt );
	void_fp ukr_fp      = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, method_cntx );

	// Only classify the micro-kernel when one is actually present.
	kimpl_t impl_kind = ( ukr_fp == NULL )
	                    ? BLIS_NOTAPPLIC_UKERNEL
	                    : bli_gks_l3_ukr_impl_type( ukr, method, dt );

	return bli_gks_l3_ukr_impl_str[ impl_kind ];
}
#if 0
// Disabled (compiled out). Would return the implementation label of
// micro-kernel ukr for whichever induced method is currently available for
// the associated operation and datatype.
// NOTE(review): this refers to bli_ukr_impl_str, whereas the label table
// defined in this file is named bli_gks_l3_ukr_impl_str; the name would
// likely need updating before this code is re-enabled -- confirm.
char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt )
{
opid_t oper;
ind_t method;
kimpl_t ki;
// We need to decide which operation we will use to query the
// current available induced method. If the ukr type given is
// BLIS_GEMM_UKR, we use gemm. Otherwise, we use trsm (since
// the four other defined ukr types are trsm-related).
if ( ukr == BLIS_GEMM_UKR ) oper = BLIS_GEMM;
else oper = BLIS_TRSM;
// Query the current available induced method using the
// chosen operation id type.
method = bli_l3_ind_oper_find_avail( oper, dt );
// Query the ukernel implementation type using the current
// available method.
ki = bli_gks_l3_ukr_impl_type( ukr, method, dt );
return bli_ukr_impl_str[ ki ];
}
#endif
// Classifies the implementation of level-3 micro-kernel ukr for the given
// induced method and datatype:
// - any method other than BLIS_NAT         -> BLIS_VIRTUAL_UKERNEL
// - native, same address as reference ukr  -> BLIS_REFERENCE_UKERNEL
// - native, different address              -> BLIS_OPTIMIZED_UKERNEL
kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt )
{
	// Anything that is not native execution is, by definition, virtual.
	if ( method != BLIS_NAT ) return BLIS_VIRTUAL_UKERNEL;

	// Native execution may be backed by either a reference or an optimized
	// micro-kernel. To find out which, compare the datatype-specific
	// function pointer held by the native context against the one held by a
	// freshly initialized reference context.
	cntx_t scratch_ref_cntx;

	// Query the architecture id.
	const arch_t hw_id = bli_arch_query_id();

	// Sanity check: verify that the arch_t id is valid.
	if ( bli_error_checking_is_enabled() )
	{
		err_t id_status = bli_check_valid_arch_id( hw_id );
		bli_check_error_code( id_status );
	}

	// Populate the local context with reference kernels and related values
	// using the initializer registered for this architecture.
	ref_cntx_init_ft init_reference = cntx_ref_init[ hw_id ];
	init_reference( &scratch_ref_cntx );

	// Fetch the native context stored in the gks.
	cntx_t* native_cntx = bli_gks_lookup_nat_cntx( hw_id );

	// Read the native micro-kernel address from both contexts.
	void_fp native_ukr    = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, native_cntx );
	void_fp reference_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &scratch_ref_cntx );

	return ( native_ukr == reference_ukr ) ? BLIS_REFERENCE_UKERNEL
	                                       : BLIS_OPTIMIZED_UKERNEL;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_gks.h 0000664 0000000 0000000 00000005201 14634250137 0021414 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_GKS_H
#define BLIS_GKS_H
// -- Lifetime of the global kernel structure (gks) ----------------------------
// Clear the internal tables and register every configured architecture.
void bli_gks_init( void );
// Free all cached contexts and the per-architecture arrays holding them.
void bli_gks_finalize( void );
// Reset the internal tables to NULL; called by bli_gks_init().
void bli_gks_init_index( void );
// -- Lookup by explicit architecture id ---------------------------------------
// Native context registered for id.
cntx_t* bli_gks_lookup_nat_cntx( arch_t id );
// Context for id and induced method ind; NULL if not yet created.
cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind );
// Array of context pointers for id; NULL if id was never registered.
cntx_t** bli_gks_lookup_id( arch_t id );
// Register the native, reference and induced-method context initializers
// for architecture id and create its native context.
void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );
// -- Queries for the current hardware (via bli_arch_query_id()) ---------------
// Native context; bli_gks_query_cntx() forwards to bli_gks_query_nat_cntx().
BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void );
BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void );
// Same as bli_gks_query_cntx(), but without calling bli_init_once().
cntx_t* bli_gks_query_cntx_noinit( void );
// Context for induced method ind, created and cached on first request.
BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt );
// Fill the caller's context with reference kernels and related values.
BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx );
// -- Micro-kernel implementation queries --------------------------------------
// Whether the native level-3 micro-kernel in cntx is the reference kernel.
bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx );
// Short label for, and classification of, a level-3 micro-kernel's
// implementation under the given induced method and datatype.
BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt );
BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt );
// Disabled in bli_gks.c (#if 0), hence commented out here as well.
//char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_ind.c 0000664 0000000 0000000 00000013654 14634250137 0021410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Human-readable names of the induced methods, indexed by an ind_t value and
// returned by bli_ind_get_impl_string().
// NOTE(review): the entry order must match the numeric values of the ind_t
// enumerators (the inline labels suggest BLIS_1M first, BLIS_NAT second) --
// confirm against the enum definition.
static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] =
{
/* 1m */ "1m",
/* nat */ "native",
};
// -----------------------------------------------------------------------------
// Chooses the default induced-method settings. For each precision, the 1m
// method is enabled for the complex datatype when both of the following hold:
// - the complex domain gemm micro-kernel is the (unoptimized) reference
//   kernel, and
// - the real domain gemm micro-kernel is NOT the (unoptimized) reference
//   kernel.
// The second condition means BLIS does not bother with an induced method
// when the real and complex domain kernels are both reference.
void bli_ind_init( void )
{
	// NOTE: bli_gks_query_cntx_noinit() is used instead of
	// bli_gks_query_cntx() to avoid the call to bli_init_once().
	cntx_t* native_cntx = bli_gks_query_cntx_noinit();

	const bool real_sp_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_FLOAT,    BLIS_GEMM_UKR, native_cntx );
	const bool real_dp_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DOUBLE,   BLIS_GEMM_UKR, native_cntx );
	const bool cplx_sp_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_SCOMPLEX, BLIS_GEMM_UKR, native_cntx );
	const bool cplx_dp_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DCOMPLEX, BLIS_GEMM_UKR, native_cntx );

	// Single precision.
	if ( cplx_sp_is_ref && !real_sp_is_ref )
	{
		bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX );
	}

	// Double precision.
	if ( cplx_dp_is_ref && !real_dp_is_ref )
	{
		bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX );
	}
}
// Counterpart of bli_ind_init(). Intentionally empty: this function performs
// no work.
void bli_ind_finalize( void )
{
	return;
}
// -----------------------------------------------------------------------------
// Enables the given induced method for both complex datatypes (single
// precision first, then double precision).
void bli_ind_enable( ind_t method )
{
	const num_t complex_dts[ 2 ] = { BLIS_SCOMPLEX, BLIS_DCOMPLEX };

	for ( int i = 0; i < 2; ++i )
	{
		bli_ind_enable_dt( method, complex_dts[ i ] );
	}
}
// Disables the given induced method for both complex datatypes (single
// precision first, then double precision).
void bli_ind_disable( ind_t method )
{
	const num_t complex_dts[ 2 ] = { BLIS_SCOMPLEX, BLIS_DCOMPLEX };

	for ( int i = 0; i < 2; ++i )
	{
		bli_ind_disable_dt( method, complex_dts[ i ] );
	}
}
void bli_ind_disable_all( void )
{
bli_ind_disable_all_dt( BLIS_SCOMPLEX );
bli_ind_disable_all_dt( BLIS_DCOMPLEX );
}
// -----------------------------------------------------------------------------
// Enables the given induced method for datatype dt. Non-complex datatypes
// are ignored.
void bli_ind_enable_dt( ind_t method, num_t dt )
{
	if ( bli_is_complex( dt ) )
	{
		bli_l3_ind_set_enable_dt( method, dt, TRUE );
	}
}
// Disables the given induced method for datatype dt. Non-complex datatypes
// are ignored.
void bli_ind_disable_dt( ind_t method, num_t dt )
{
	if ( bli_is_complex( dt ) )
	{
		bli_l3_ind_set_enable_dt( method, dt, FALSE );
	}
}
// Disables every induced method except native execution for datatype dt.
void bli_ind_disable_all_dt( num_t dt )
{
	for ( ind_t im = 0; im < BLIS_NUM_IND_METHODS; ++im )
	{
		// Native execution must always remain enabled.
		if ( im == BLIS_NAT ) continue;

		bli_ind_disable_dt( im, dt );
	}
}
// -----------------------------------------------------------------------------
// Forwards to bli_l3_ind_oper_enable_only() for a level-3 operation and a
// complex datatype. Requests involving a non-complex datatype, or an
// operation that is not level-3 (for which induced methods are not
// implemented), are currently no-ops.
void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt )
{
	if ( bli_is_complex( dt ) && bli_opid_is_level3( oper ) )
	{
		bli_l3_ind_oper_enable_only( oper, method, dt );
	}
}
// -----------------------------------------------------------------------------
// Reports whether operation oper is implemented for the given induced
// method.
bool bli_ind_oper_is_impl( opid_t oper, ind_t method )
{
	// Level-3 operations: defer to the level-3 lookup for this method id.
	if ( bli_opid_is_level3( oper ) )
	{
		return bli_l3_ind_oper_is_impl( oper, method );
	}

	// All other operations are reported as not implemented, unless the
	// check was for BLIS_NAT, in which case every operation is implemented.
	return ( method == BLIS_NAT ) ? TRUE : FALSE;
}
// Returns the induced method currently available for operation oper and
// datatype dt.
ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt )
{
	// Currently, any operation that is not level-3 is guaranteed to be
	// native.
	if ( !bli_opid_is_level3( oper ) ) return BLIS_NAT;

	return bli_l3_ind_oper_find_avail( oper, dt );
}
// Returns the name of the induced method currently available for operation
// oper and datatype dt.
char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt )
{
	return bli_ind_get_impl_string( bli_ind_oper_find_avail( oper, dt ) );
}
// -----------------------------------------------------------------------------
// Look up the implementation string stored for `method` in bli_ind_impl_str.
// The index is used as-is; no range check is performed here.
char* bli_ind_get_impl_string( ind_t method )
{
	char* impl_str = bli_ind_impl_str[ method ];

	return impl_str;
}
// Map a complex datatype to a zero-based index: 0 for scomplex, 1 otherwise.
// Calls bli_abort() when `dt` is not a complex datatype.
num_t bli_ind_map_cdt_to_index( num_t dt )
{
	// A non-complex datatype should never be passed in.
	if ( !bli_is_complex( dt ) )
	{
		bli_abort();
	}

	// scomplex -> 0; the remaining complex type (dcomplex) -> 1.
	return bli_is_scomplex( dt ) ? 0 : 1;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_ind.h 0000664 0000000 0000000 00000005122 14634250137 0021404 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_IND_H
#define BLIS_IND_H
// level-3 induced method management
#include "bli_l3_ind.h"
// Setup/teardown of the induced-method sub-API; bli_init_apis() and
// bli_finalize_apis() in bli_init.c call these.
void bli_ind_init( void );
void bli_ind_finalize( void );
// Enable/disable an induced method. The *_dt variants operate on a single
// datatype and ignore non-complex datatypes.
BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method );
BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method );
BLIS_EXPORT_BLIS void bli_ind_disable_all( void );
BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt );
// Disables every method except BLIS_NAT for the given datatype.
BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt );
// Per-operation control; a no-op unless dt is complex and oper is level-3.
BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
// Per-operation queries. Non-level-3 operations are reported as implemented
// only for BLIS_NAT, and BLIS_NAT is the method found for them.
BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method );
BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );
// Helpers that are not exported (no BLIS_EXPORT_BLIS).
// bli_ind_map_cdt_to_index() aborts if given a non-complex datatype.
char* bli_ind_get_impl_string( ind_t method );
num_t bli_ind_map_cdt_to_index( num_t dt );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_info.c 0000664 0000000 0000000 00000017050 14634250137 0021563 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- General library information ----------------------------------------------
// This string gets defined via -D on the command line when BLIS is compiled.
// This string is (or rather, should be) only used here.
// File-local string constants returned by the two query functions below.
static char* bli_version_str       = BLIS_VERSION_STRING;
static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE );

// Return the library version string (BLIS_VERSION_STRING).
char* bli_info_get_version_str( void )
{
	return bli_version_str;
}

// Return BLIS_INT_TYPE_SIZE rendered as a string.
char* bli_info_get_int_type_size_str( void )
{
	return bli_int_type_size_str;
}
// -- General configuration-related --------------------------------------------
// Each function below returns the value of one compile-time configuration
// macro, named in its return statement.

gint_t bli_info_get_int_type_size( void )
{
	return BLIS_INT_TYPE_SIZE;
}

gint_t bli_info_get_num_fp_types( void )
{
	return BLIS_NUM_FP_TYPES;
}

gint_t bli_info_get_max_type_size( void )
{
	return BLIS_MAX_TYPE_SIZE;
}

gint_t bli_info_get_page_size( void )
{
	return BLIS_PAGE_SIZE;
}

gint_t bli_info_get_simd_num_registers( void )
{
	return BLIS_SIMD_MAX_NUM_REGISTERS;
}

gint_t bli_info_get_simd_size( void )
{
	return BLIS_SIMD_MAX_SIZE;
}

gint_t bli_info_get_simd_align_size( void )
{
	return BLIS_SIMD_ALIGN_SIZE;
}

gint_t bli_info_get_stack_buf_max_size( void )
{
	return BLIS_STACK_BUF_MAX_SIZE;
}

gint_t bli_info_get_stack_buf_align_size( void )
{
	return BLIS_STACK_BUF_ALIGN_SIZE;
}

gint_t bli_info_get_heap_addr_align_size( void )
{
	return BLIS_HEAP_ADDR_ALIGN_SIZE;
}

gint_t bli_info_get_heap_stride_align_size( void )
{
	return BLIS_HEAP_STRIDE_ALIGN_SIZE;
}
// Pool address alignment sizes, one getter per BLIS_POOL_ADDR_ALIGN_SIZE_*
// macro (A, B, C, GEN).

gint_t bli_info_get_pool_addr_align_size_a( void )
{
	return BLIS_POOL_ADDR_ALIGN_SIZE_A;
}

gint_t bli_info_get_pool_addr_align_size_b( void )
{
	return BLIS_POOL_ADDR_ALIGN_SIZE_B;
}

gint_t bli_info_get_pool_addr_align_size_c( void )
{
	return BLIS_POOL_ADDR_ALIGN_SIZE_C;
}

gint_t bli_info_get_pool_addr_align_size_gen( void )
{
	return BLIS_POOL_ADDR_ALIGN_SIZE_GEN;
}

// Pool address offset sizes, one getter per BLIS_POOL_ADDR_OFFSET_SIZE_*
// macro (A, B, C, GEN).

gint_t bli_info_get_pool_addr_offset_size_a( void )
{
	return BLIS_POOL_ADDR_OFFSET_SIZE_A;
}

gint_t bli_info_get_pool_addr_offset_size_b( void )
{
	return BLIS_POOL_ADDR_OFFSET_SIZE_B;
}

gint_t bli_info_get_pool_addr_offset_size_c( void )
{
	return BLIS_POOL_ADDR_OFFSET_SIZE_C;
}

gint_t bli_info_get_pool_addr_offset_size_gen( void )
{
	return BLIS_POOL_ADDR_OFFSET_SIZE_GEN;
}
// Returns 1 when BLIS_ENABLE_BLAS was defined at compile time, 0 otherwise.
gint_t bli_info_get_enable_blas( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_BLAS
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_CBLAS was defined at compile time, 0 otherwise.
gint_t bli_info_get_enable_cblas( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_CBLAS
	enabled = 1;
#endif
	return enabled;
}
// Returns the value of the BLIS_BLAS_INT_TYPE_SIZE configuration macro.
gint_t bli_info_get_blas_int_type_size( void )
{
	return BLIS_BLAS_INT_TYPE_SIZE;
}
// Returns 1 when BLIS_ENABLE_PBA_POOLS was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_enable_pba_pools( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_PBA_POOLS
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_SBA_POOLS was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_enable_sba_pools( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_SBA_POOLS
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when either the OpenMP or the pthreads query reports enabled,
// 0 when neither does. The pthreads query is only consulted if the OpenMP
// query returns zero.
gint_t bli_info_get_enable_threading( void )
{
	if ( bli_info_get_enable_openmp() ) return 1;
	if ( bli_info_get_enable_pthreads() ) return 1;

	return 0;
}
// Returns 1 when BLIS_ENABLE_OPENMP was defined at compile time, 0 otherwise.
gint_t bli_info_get_enable_openmp( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_OPENMP
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_PTHREADS was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_enable_pthreads( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_PTHREADS
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_JRIR_SLAB was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_thread_part_jrir_slab( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_JRIR_SLAB
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_JRIR_RR was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_thread_part_jrir_rr( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_JRIR_RR
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_MEMKIND was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_enable_memkind( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_MEMKIND
	enabled = 1;
#endif
	return enabled;
}
// Returns 1 when BLIS_ENABLE_SANDBOX was defined at compile time,
// 0 otherwise.
gint_t bli_info_get_enable_sandbox( void )
{
	gint_t enabled = 0;
#ifdef BLIS_ENABLE_SANDBOX
	enabled = 1;
#endif
	return enabled;
}
// -- Kernel implementation-related --------------------------------------------
// -- Level-3 kernel definitions --
// Each query below first makes sure the library is initialized
// (bli_init_once()) and then asks bli_gks_l3_ukr_impl_string() for the
// implementation string of one level-3 micro-kernel id, for the given
// induced method and datatype.

char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt )
{
	bli_init_once();

	return bli_gks_l3_ukr_impl_string( BLIS_GEMM_UKR, method, dt );
}

char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt )
{
	bli_init_once();

	return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_L_UKR, method, dt );
}

char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt )
{
	bli_init_once();

	return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_U_UKR, method, dt );
}

char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt )
{
	bli_init_once();

	return bli_gks_l3_ukr_impl_string( BLIS_TRSM_L_UKR, method, dt );
}

char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt )
{
	bli_init_once();

	return bli_gks_l3_ukr_impl_string( BLIS_TRSM_U_UKR, method, dt );
}
// -- BLIS implementation query (level-3) --------------------------------------
// Each query below returns the implementation string of the method that
// bli_ind_oper_find_avail() selects for one level-3 operation id and the
// given datatype.
//
// NOTE(review): the herk, her2k, syrk and syr2k queries all look up
// BLIS_GEMMT rather than an operation id of their own. This looks
// deliberate, but confirm against the opid_t definition.

char* bli_info_get_gemm_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt );
}

char* bli_info_get_gemmt_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt );
}

char* bli_info_get_hemm_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt );
}

char* bli_info_get_herk_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt );
}

char* bli_info_get_her2k_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt );
}

char* bli_info_get_symm_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt );
}

char* bli_info_get_syrk_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt );
}

char* bli_info_get_syr2k_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt );
}

char* bli_info_get_trmm_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt );
}

char* bli_info_get_trmm3_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt );
}

char* bli_info_get_trsm_impl_string( num_t dt )
{
	return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_info.h 0000664 0000000 0000000 00000012517 14634250137 0021573 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- General library information ----------------------------------------------
// Both functions return pointers to strings owned by the library
// (file-scope statics in bli_info.c).
BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );
// -- General configuration-related --------------------------------------------
// These return the value of the correspondingly named BLIS_* configuration
// macro as a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
// NOTE(review): bli_info.c contains no definition of the function declared on
// the next line. Confirm it is defined elsewhere, or remove the prototype;
// otherwise callers will fail at link time.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// The bli_info_get_enable_*() and bli_info_get_thread_part_*() queries return
// 1 when the matching BLIS_ENABLE_* macro was defined at build time and 0
// otherwise. bli_info_get_enable_threading() returns 1 when either the OpenMP
// or the pthreads query does.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );
// -- Kernel implementation-related --------------------------------------------
// -- Level-3 kernel definitions --
// These call bli_init_once() before querying, so they may be called without
// an explicit bli_init().
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );
// -- BLIS implementation query (level-3) --------------------------------------
// Each returns the implementation string of the method currently selected for
// the operation and datatype. In bli_info.c the herk, her2k, syrk and syr2k
// queries all look up BLIS_GEMMT.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );
cython-blis-1.0.0/blis/_src/frame/base/bli_init.c 0000664 0000000 0000000 00000010534 14634250137 0021573 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -----------------------------------------------------------------------------
// Public initialization entry point; delegates to bli_init_once().
void bli_init( void ) { bli_init_once(); }
// Public finalization entry point; delegates to bli_finalize_once().
void bli_finalize( void ) { bli_finalize_once(); }
// -----------------------------------------------------------------------------
// Automatic-initialization hook; delegates to bli_init_once().
void bli_init_auto( void ) { bli_init_once(); }
// Automatic-finalization hook. Intentionally does nothing.
void bli_finalize_auto( void )
{
	// The _auto() functions are used when initializing the BLAS
	// compatibility layer. Initializing and finalizing around every BLAS
	// routine call would make little sense, so the library stays
	// initialized unless and until the application explicitly calls
	// bli_finalize().
	return;
}
// -----------------------------------------------------------------------------
// A pthread_once_t variable is a pthread structure used in pthread_once().
// pthread_once() is guaranteed to execute exactly once among all threads that
// pass in this control object (until/unless the variable is reset).
//
// once_init guards bli_init_apis() and once_finalize guards
// bli_finalize_apis(). Each of those functions resets the *other* control
// variable when it runs, which is what allows the library to be finalized
// after initialization and re-initialized after finalization.
static bli_pthread_once_t once_init = BLIS_PTHREAD_ONCE_INIT;
static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT;
// Run bli_init_apis() through bli_pthread_once() using the once_init
// control variable.
void bli_init_once( void ) { bli_pthread_once( &once_init, bli_init_apis ); }
// Run bli_finalize_apis() through bli_pthread_once() using the once_finalize
// control variable.
void bli_finalize_once( void ) { bli_pthread_once( &once_finalize, bli_finalize_apis ); }
// -----------------------------------------------------------------------------
// Initialize every sub-API in order, then re-arm the finalization control
// variable so that a later bli_finalize_once() will run.
void bli_init_apis( void )
{
	// A fresh control object to copy from. BLIS_PTHREAD_ONCE_INIT may be a
	// brace-enclosed struct initializer, which strict C99 accepts only in a
	// declaration and not in a plain assignment, hence the intermediate
	// object.
	const bli_pthread_once_t fresh_once = BLIS_PTHREAD_ONCE_INIT;

	// Bring up the sub-APIs (bli_finalize_apis() tears them down in the
	// reverse order).
	bli_gks_init();
	bli_ind_init();
	bli_thread_init();
	bli_pack_init();
	bli_memsys_init();

	// Reset the control variable that will allow finalization.
	once_finalize = fresh_once;
}
void bli_finalize_apis( void )
{
// Finalize various sub-APIs.
bli_memsys_finalize();
bli_pack_finalize();
bli_thread_finalize();
bli_ind_finalize();
bli_gks_finalize();
// Reset the control variable that will allow (re-)initialization.
// NOTE: We must initialize a fresh pthread_once_t object and THEN copy the
// contents to the static control variable because some implementations of
// pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as
// a struct initializer expression (i.e. { ... }), which cannot be used in
// post-declaration struct assignment in strict C99.
const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT;
once_init = once_new;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_init.h 0000664 0000000 0000000 00000003625 14634250137 0021603 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Exported entry points; each delegates to the matching *_once() function.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );
// Automatic variants: bli_init_auto() calls bli_init_once(), while
// bli_finalize_auto() is intentionally a no-op.
void bli_init_auto( void );
void bli_finalize_auto( void );
// Perform the actual setup/teardown of the sub-APIs. These are the callbacks
// handed to bli_pthread_once() by the *_once() functions below.
void bli_init_apis( void );
void bli_finalize_apis( void );
// Wrappers that run the *_apis() callbacks via bli_pthread_once().
void bli_init_once( void );
void bli_finalize_once( void );
cython-blis-1.0.0/blis/_src/frame/base/bli_machval.c 0000664 0000000 0000000 00000006735 14634250137 0022253 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature shared by the type-specific machval functions generated further
// down in this file: the parameter id to query and an untyped pointer to the
// output element.
#define FUNCPTR_T machval_fp
typedef void (*FUNCPTR_T)(
machval_t mval,
void* v
);
// Table of those functions; bli_machval() indexes it by the datatype of its
// output object.
// NOTE(review): GENARRAY is defined elsewhere. It presumably expands to an
// array named ftypes with one entry per datatype — confirm.
static FUNCPTR_T GENARRAY(ftypes,machval);
//
// Define object-based interface.
//
// Object-based interface: write machine parameter `mval` into object `v`,
// dispatching to the type-specific function selected by v's datatype.
void bli_machval( machval_t mval,
                  obj_t*    v )
{
	const num_t dt  = bli_obj_dt( v );
	void*       buf = bli_obj_buffer_at_off( v );

	// Look up the function for this datatype and call it on v's buffer.
	ftypes[ dt ]( mval, buf );
}
//
// Define BLAS-like interfaces.
//
// Template for the type-specific machval functions.
//
// Each generated function keeps a function-local static table (pvals) of
// BLIS_NUM_MACH_PARAMS values of the real type ctype_vr. On the first call
// it fills entries 0 .. BLIS_NUM_MACH_PARAMS-2 by mapping each BLIS
// parameter id, starting at BLIS_MACH_PARAM_FIRST, to its netlib character
// code and calling the real-domain routine named by `varname` (lamch in the
// instantiation below). The last entry is then set to pvals[0] * pvals[0],
// which the inline comment describes as epsilon^2. Every call then copies
// pvals[ mval - BLIS_MACH_PARAM_FIRST ] into *v, converting from ctype_vr
// to ctype_v via the `copys` macro.
//
// NOTE(review): `first_time` and `pvals` are plain statics with no
// synchronization visible in this block, so concurrent first calls may
// race — confirm whether callers serialize the first use.
// NOTE(review): `mval` is not range-checked before it is used to index
// pvals; callers are presumably expected to pass a valid machval_t.
#undef GENTFUNCR
#define GENTFUNCR( ctype_v, ctype_vr, chv, chvr, opname, varname ) \
\
void PASTEMAC(chv,opname) \
( \
machval_t mval, \
void* v \
) \
{ \
static ctype_vr pvals[ BLIS_NUM_MACH_PARAMS ]; \
\
static bool first_time = TRUE; \
\
dim_t val_i = mval - BLIS_MACH_PARAM_FIRST; \
ctype_v* v_cast = v; \
\
/* If this is the first time through, call the underlying
code to discover each machine parameter. */ \
if ( first_time ) \
{ \
char lapack_mval; \
dim_t m, i; \
\
for( i = 0, m = BLIS_MACH_PARAM_FIRST; \
i < BLIS_NUM_MACH_PARAMS - 1; \
++i, ++m ) \
{ \
bli_param_map_blis_to_netlib_machval( m, &lapack_mval ); \
\
/*printf( "bli_machval: querying %u %c\n", m, lapack_mval );*/ \
\
pvals[i] = PASTEMAC(chvr,varname)( &lapack_mval, 1 ); \
\
/*printf( "bli_machval: got back %34.29e\n", pvals[i] ); */ \
} \
\
/* Store epsilon^2 in the last element. */ \
pvals[i] = pvals[0] * pvals[0]; \
\
first_time = FALSE; \
} \
\
/* Copy the requested parameter value to the output buffer, which
may involve a demotion from the complex to real domain. */ \
PASTEMAC2(chvr,chv,copys)( pvals[ val_i ], *v_cast ); \
}
INSERT_GENTFUNCR_BASIC( machval, lamch )
cython-blis-1.0.0/blis/_src/frame/base/bli_machval.h 0000664 0000000 0000000 00000004113 14634250137 0022244 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_lsame.h"
#include "bli_slamch.h"
#include "bli_dlamch.h"
//
// Prototype object-based interface.
//
// Writes the machine parameter selected by mval into the object v.
BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );
//
// Prototype BLAS-like interfaces.
//
// One prototype is generated per instantiation; each takes the parameter id
// and an untyped pointer to the output element.
#undef GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
( \
machval_t mval, \
void* v \
);
INSERT_GENTPROTR_BASIC0( machval )
cython-blis-1.0.0/blis/_src/frame/base/bli_malloc.c 0000664 0000000 0000000 00000016761 14634250137 0022107 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//#define BLIS_ENABLE_MEM_TRACING
// -----------------------------------------------------------------------------
// NOTE: These functions are no longer used. Instead, the relevant sections
// of code call bli_fmalloc_align() and pass in the desired malloc()-like
// function, such as BLIS_MALLOC_POOL.
// NOTE(review): the disabled code below is stale. It calls
// bli_fmalloc_align() with three arguments, whereas the definition later in
// this file takes a fourth (err_t* r_val), so it would not compile if simply
// re-enabled.
#if 0
void* bli_malloc_pool( size_t size )
{
const malloc_ft malloc_fp = BLIS_MALLOC_POOL;
const size_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE;
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_malloc_pool(): size %ld, align size %ld\n",
( long )size, ( long )align_size );
fflush( stdout );
#endif
return bli_fmalloc_align( malloc_fp, size, align_size );
}
void bli_free_pool( void* p )
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_free_pool(): freeing block\n" );
fflush( stdout );
#endif
bli_ffree_align( BLIS_FREE_POOL, p );
}
#endif
// -----------------------------------------------------------------------------
// Allocate `size` bytes for user data with BLIS_MALLOC_USER, aligned to
// BLIS_HEAP_ADDR_ALIGN_SIZE. The status is reported through `r_val` by
// bli_fmalloc_align(); the returned pointer must be released with
// bli_free_user().
void* bli_malloc_user( size_t size, err_t* r_val )
{
	const size_t    align_size = BLIS_HEAP_ADDR_ALIGN_SIZE;
	const malloc_ft malloc_fp  = BLIS_MALLOC_USER;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_malloc_user(): size %ld, align size %ld\n",
	        ( long )size, ( long )align_size );
	fflush( stdout );
#endif

	return bli_fmalloc_align( malloc_fp, size, align_size, r_val );
}
// Release a block obtained from bli_malloc_user(), using BLIS_FREE_USER as
// the underlying free()-like function.
void bli_free_user( void* p )
{
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_free_user(): freeing block\n" );
	fflush( stdout );
#endif

	// The block was allocated with alignment padding, so it has to go
	// through the aligned free path.
	bli_ffree_align( BLIS_FREE_USER, p );
}
// -----------------------------------------------------------------------------
// Allocate `size` bytes for internal use with BLIS_MALLOC_INTL. No alignment
// padding is added. The status is reported through `r_val` by
// bli_fmalloc_noalign(); release the block with bli_free_intl().
void* bli_malloc_intl( size_t size, err_t* r_val )
{
	const malloc_ft malloc_fp = BLIS_MALLOC_INTL;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_malloc_intl(): size %ld\n", ( long )size );
	fflush( stdout );
#endif

	return bli_fmalloc_noalign( malloc_fp, size, r_val );
}
// Allocate `size` bytes via bli_malloc_intl() and zero them when the
// allocation reports success through `r_val`. Whatever pointer
// bli_malloc_intl() produced is returned unchanged either way.
void* bli_calloc_intl( size_t size, err_t* r_val )
{
#ifdef BLIS_ENABLE_MEM_TRACING
	// Prefix only; bli_malloc_intl() prints the rest of the trace line.
	printf( "bli_calloc_intl(): " );
#endif

	void* p = bli_malloc_intl( size, r_val );

	// Leave the buffer untouched unless the allocation succeeded.
	if ( !bli_is_success( *r_val ) ) return p;

	memset( p, 0, size );

	return p;
}
// Release a block obtained from bli_malloc_intl() or bli_calloc_intl(), using
// BLIS_FREE_INTL as the underlying free()-like function.
void bli_free_intl( void* p )
{
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_free_intl(): freeing block\n" );
	fflush( stdout );
#endif

	// Internal blocks carry no alignment padding, so use the unaligned
	// free path.
	bli_ffree_noalign( BLIS_FREE_INTL, p );
}
// -----------------------------------------------------------------------------
// Allocate 'size' bytes through the malloc()-like function f and return an
// address aligned to align_size.
//
// The function over-allocates by align_size + sizeof( void* ) bytes, picks
// the first suitably aligned address that leaves room for one pointer in
// front of it, and stores the allocator's original return value in that
// pointer-sized slot so that bli_ffree_align() can recover it later.
//
// Parameters:
//   f           malloc()-like allocation function.
//   size        number of usable bytes requested.
//   align_size  required alignment of the returned address (validated by
//               bli_fmalloc_align_check() when error checking is enabled).
//   r_val       receives BLIS_SUCCESS on every return path.
//
// Returns the aligned pointer, or NULL if size is zero or f returned NULL.
// Blocks returned here must be released with bli_ffree_align().
void* bli_fmalloc_align
     (
       malloc_ft f,
       size_t    size,
       size_t    align_size,
       err_t*    r_val
     )
{
	const size_t ptr_size     = sizeof( void* );
	size_t       align_offset = 0;
	void*        p_orig;
	int8_t*      p_byte;
	void**       p_addr;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_fmalloc_align_check( f, size, align_size );

	// The pseudo-return value isn't used yet. Set it up front so that it is
	// defined on every return path, including the early returns below.
	*r_val = BLIS_SUCCESS;

	// Return early if zero bytes were requested.
	if ( size == 0 ) return NULL;

	// Add the alignment size and the size of a pointer to the number
	// of bytes to allocate.
	size += align_size + ptr_size;

	// Call the allocation function.
	p_orig = f( size );

	// Check the pointer returned by malloc().
	if ( bli_error_checking_is_enabled() )
		bli_fmalloc_post_check( p_orig );

	// If error checking is disabled, a failed allocation reaches this point.
	// Return NULL rather than performing arithmetic on, and storing through,
	// an address derived from a NULL pointer. Callers detect the failure via
	// the NULL return value.
	if ( p_orig == NULL ) return NULL;

	// Advance the pointer by one pointer element.
	p_byte  = p_orig;
	p_byte += ptr_size;

	// Compute the offset to the desired alignment.
	if ( bli_is_unaligned_to( ( siz_t )p_byte, ( siz_t )align_size ) )
	{
		align_offset = align_size -
		               bli_offset_past_alignment( ( siz_t )p_byte,
		                                          ( siz_t )align_size );
	}

	// Advance the pointer using the difference between the alignment
	// size and the alignment offset.
	p_byte += align_offset;

	// Compute the address of the pointer element just before the start
	// of the aligned address, and store the original address there.
	p_addr  = ( void** )( p_byte - ptr_size );
	*p_addr = p_orig;

	// Return the aligned pointer.
	return p_byte;
}
// Free a block that was returned by bli_fmalloc_align(), using the
// free()-like function f. A NULL pointer is ignored.
void bli_ffree_align( free_ft f, void* p )
{
	// A NULL pointer never came from an aligned allocation; nothing to do.
	if ( p == NULL ) return;

	// bli_fmalloc_align() returned an aligned address, not the address the
	// allocator produced. The allocator's address was saved in the
	// pointer-sized slot immediately preceding the aligned address, so step
	// back by one pointer and read it.
	void** const slot = ( void** )( ( int8_t* )p - sizeof( void* ) );

	// Release the block using the address the allocator originally returned.
	f( *slot );
}
// -----------------------------------------------------------------------------
// Allocate 'size' bytes through the malloc()-like function f without any
// alignment adjustment. The allocator's result is returned unchanged and
// *r_val is set to BLIS_SUCCESS.
void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val )
{
	void* const block = f( size );

	// Validate the allocator's result when error checking is on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_fmalloc_post_check( block );
	}

	// The status output is a placeholder for now; it always reports success.
	*r_val = BLIS_SUCCESS;

	return block;
}
// Free a block that was returned by bli_fmalloc_noalign(). The pointer is
// passed as-is to the free()-like function f.
void bli_ffree_noalign( free_ft f, void* p )
{
	( *f )( p );
}
// -----------------------------------------------------------------------------
// Validate the arguments of bli_fmalloc_align(). Only align_size is
// inspected: it must pass the power-of-two check and the
// multiple-of-pointer-size check. Each result is fed to
// bli_check_error_code(). The f and size parameters are accepted but unused.
void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size )
{
	// First requirement: power-of-two alignment.
	const err_t pow2_status = bli_check_alignment_is_power_of_two( align_size );
	bli_check_error_code( pow2_status );

	// Second requirement: alignment is a multiple of the pointer size.
	const err_t mult_status = bli_check_alignment_is_mult_of_ptr_size( align_size );
	bli_check_error_code( mult_status );
}
// Validate a pointer that was just returned by a malloc()-like function:
// run it through bli_check_valid_malloc_buf() and pass the resulting code
// to bli_check_error_code().
void bli_fmalloc_post_check( void* p )
{
	const err_t buf_status = bli_check_valid_malloc_buf( p );

	bli_check_error_code( buf_status );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_malloc.h 0000664 0000000 0000000 00000005364 14634250137 0022111 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Typedef function pointer types for malloc() and free() substitutes.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );
// -----------------------------------------------------------------------------
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif
// Internal-use allocation routines (declared without BLIS_EXPORT_BLIS).
// bli_calloc_intl() additionally zeroes the allocated bytes. Each allocator
// writes a status code to *r_val.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );
// User-level allocation routines, exported via BLIS_EXPORT_BLIS.
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );
// -----------------------------------------------------------------------------
// Generic helpers parameterized on the malloc()/free()-like function f.
// Blocks from bli_fmalloc_align() must be released with bli_ffree_align(),
// and blocks from bli_fmalloc_noalign() with bli_ffree_noalign().
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );
void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );
// Parameter and result checks used by the helpers above.
void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );
cython-blis-1.0.0/blis/_src/frame/base/bli_mbool.c 0000664 0000000 0000000 00000004453 14634250137 0021743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Allocate an mbool_t with bli_malloc_intl() and initialize its four
// per-datatype flags.
//
// Parameters:
//   b_s, b_d, b_c, b_z  values stored for BLIS_FLOAT, BLIS_DOUBLE,
//                       BLIS_SCOMPLEX and BLIS_DCOMPLEX, respectively.
//
// Returns the new object, or NULL if the allocation returned NULL. The
// object should be released with bli_mbool_free().
mbool_t* bli_mbool_create
     (
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     )
{
	mbool_t* b;
	err_t    r_val;

	b = ( mbool_t* ) bli_malloc_intl( sizeof( mbool_t ), &r_val );

	// bli_mbool_init() writes through b, so do not call it when the
	// allocation failed (possible when error checking is disabled).
	if ( b == NULL ) return NULL;

	bli_mbool_init
	(
	  b,
	  b_s,
	  b_d,
	  b_c,
	  b_z
	);

	return b;
}
// Populate an existing mbool_t: b_s, b_d, b_c and b_z are stored as the
// flags for BLIS_FLOAT, BLIS_DOUBLE, BLIS_SCOMPLEX and BLIS_DCOMPLEX, in
// that order.
void bli_mbool_init( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z )
{
	// Pair each flag value with the datatype slot it belongs to.
	const bool  flags[ 4 ] = { b_s, b_d, b_c, b_z };
	const num_t types[ 4 ] = { BLIS_FLOAT, BLIS_DOUBLE,
	                           BLIS_SCOMPLEX, BLIS_DCOMPLEX };
	int         i;

	for ( i = 0; i < 4; ++i )
	{
		bli_mbool_set_dt( flags[ i ], types[ i ], b );
	}
}
// Release an mbool_t by handing it to bli_free_intl(), the counterpart of
// the bli_malloc_intl() call made in bli_mbool_create().
void bli_mbool_free
     (
       mbool_t* b
     )
{
	void* const block = b;

	bli_free_intl( block );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_mbool.h 0000664 0000000 0000000 00000004462 14634250137 0021750 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -----------------------------------------------------------------------------
// mbool_t query
// Return the flag stored in mb for datatype dt (dt indexes the v array).
BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb )
{
	const bool flag = ( bool )( mb->v[ dt ] );

	return flag;
}
// mbool_t modification
// Store val as the flag for datatype dt in mb (dt indexes the v array).
BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb )
{
	( *mb ).v[ dt ] = val;
}
// -----------------------------------------------------------------------------
// Allocate a new mbool_t and initialize it with the four given flags.
mbool_t* bli_mbool_create
(
bool b_s,
bool b_d,
bool b_c,
bool b_z
);
// Initialize an existing mbool_t with the four given flags.
void bli_mbool_init
(
mbool_t* b,
bool b_s,
bool b_d,
bool b_c,
bool b_z
);
// Release an mbool_t.
void bli_mbool_free( mbool_t* b );
cython-blis-1.0.0/blis/_src/frame/base/bli_mem.h 0000664 0000000 0000000 00000010252 14634250137 0021410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MEM_H
#define BLIS_MEM_H
// mem_t object type (defined in bli_type_defs.h)
/*
typedef struct mem_s
{
pblk_t pblk;
packbuf_t buf_type;
pool_t* pool;
siz_t size;
} mem_t;
typedef struct
{
void* buf;
siz_t block_size;
} pblk_t;
*/
//
// -- mem_t query --------------------------------------------------------------
//
// Return the address of the pblk_t embedded in mem.
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
{
	pblk_t* const pblk = &mem->pblk;

	return pblk;
}
// Return the buffer address recorded in mem's embedded pblk_t.
BLIS_INLINE void* bli_mem_buffer( mem_t* mem )
{
	pblk_t* const pblk = bli_mem_pblk( mem );

	return bli_pblk_buf( pblk );
}
// Return the buf_type field of mem.
BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem )
{
	const packbuf_t buf_type = mem->buf_type;

	return buf_type;
}
// Return the pool field of mem.
BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem )
{
	pool_t* const pool = mem->pool;

	return pool;
}
// Return the size field of mem.
BLIS_INLINE siz_t bli_mem_size( mem_t* mem )
{
	const siz_t size = mem->size;

	return size;
}
// Report whether mem currently has a buffer, i.e. whether its buffer
// address is non-NULL.
BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem )
{
	void* const buf = bli_mem_buffer( mem );

	return ( bool )( buf != NULL );
}
// Report whether mem has no buffer, i.e. whether its buffer address is NULL.
BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
{
	void* const buf = bli_mem_buffer( mem );

	return ( bool )( buf == NULL );
}
//
// -- mem_t modification -------------------------------------------------------
//
// Copy the contents of *pblk into the pblk_t embedded in mem.
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
{
	( *mem ).pblk = *pblk;
}
// Record buf as the buffer address in mem's embedded pblk_t.
BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem )
{
	pblk_t* const pblk = &( mem->pblk );

	bli_pblk_set_buf( buf, pblk );
}
// Set the buf_type field of mem.
BLIS_INLINE void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem )
{
	( *mem ).buf_type = buf_type;
}
// Set the pool field of mem.
BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem )
{
	( *mem ).pool = pool;
}
// Set the size field of mem.
BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
{
	( *mem ).size = size;
}
//
// -- mem_t initialization -----------------------------------------------------
//
// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the mem_t type definition. An alternative to the initializer is
// calling bli_mem_clear() at runtime.
// The initializer gives the mem_t a BLIS_PBLK_INITIALIZER pblk, a buf_type of
// -1, a NULL pool, and a size of zero.
// The closing brace line intentionally carries no trailing backslash, so the
// macro definition ends there and cannot absorb whatever line follows it.
#define BLIS_MEM_INITIALIZER \
        { \
          .pblk        = BLIS_PBLK_INITIALIZER, \
          .buf_type    = -1, \
          .pool        = NULL, \
          .size        = 0, \
        }
// Reset every field of mem to its "empty" value: NULL buffer, placeholder
// buffer type, NULL pool, and zero size. This is the runtime alternative to
// BLIS_MEM_INITIALIZER.
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
#ifdef __cplusplus
	// When using C++, which is strongly typed, -1 cannot be used as a
	// packbuf_t value without a compile-time error, so a real enumerator
	// is used as the placeholder instead.
	const packbuf_t cleared_type = BLIS_BUFFER_FOR_GEN_USE;
#else
	const packbuf_t cleared_type = ( packbuf_t )-1;
#endif

	bli_mem_set_buffer( NULL, mem );
	bli_mem_set_buf_type( cleared_type, mem );
	bli_mem_set_pool( NULL, mem );
	bli_mem_set_size( 0, mem );
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_memsys.c 0000664 0000000 0000000 00000005115 14634250137 0022144 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Bring up the memory subsystem: first the packing block allocator (pba),
// then the small block allocator (sba).
void bli_memsys_init( void )
{
	// The packing block allocator needs a context. Obtain one with
	// bli_gks_query_cntx_noinit() rather than bli_gks_query_cntx(), so that
	// the query does not itself go through bli_init_once().
	cntx_t* const cntx = bli_gks_query_cntx_noinit();

	// Set up the packing block allocator and its data structures.
	bli_pba_init( cntx );

	// Set up the small block allocator and its data structures.
	bli_sba_init();
}
// Tear down the memory subsystem in the reverse order of bli_memsys_init():
// the small block allocator first, then the packing block allocator.
void bli_memsys_finalize
     (
       void
     )
{
	// Small block allocator and its data structures.
	bli_sba_finalize();

	// Packing block allocator and its data structures.
	bli_pba_finalize();
}
cython-blis-1.0.0/blis/_src/frame/base/bli_memsys.h 0000664 0000000 0000000 00000003716 14634250137 0022156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MEMSYS_H
#define BLIS_MEMSYS_H
// -----------------------------------------------------------------------------
// Initialize the packing block allocator and the small block allocator.
void bli_memsys_init( void );
// Finalize the small block allocator and the packing block allocator.
void bli_memsys_finalize( void );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_obj.c 0000664 0000000 0000000 00000047062 14634250137 0021410 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Create an m x n object of datatype dt and allocate a buffer for it.
// The object is first set up without a buffer, then bli_obj_alloc_buffer()
// is called with the given row and column strides and an imaginary stride
// of 1.
void bli_obj_create( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj )
{
	const inc_t is = 1;

	bli_init_once();

	// Step 1: initialize all fields except the buffer and strides.
	bli_obj_create_without_buffer( dt, m, n, obj );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_obj_create(): " );
#endif

	// Step 2: allocate the buffer and record the strides.
	bli_obj_alloc_buffer( rs, cs, is, obj );
}
// Create an m x n object of datatype dt that refers to the caller-supplied
// buffer p. The object is first set up without a buffer, then p is attached
// with the given row and column strides and an imaginary stride of 1.
void bli_obj_create_with_attached_buffer( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj )
{
	const inc_t is = 1;

	bli_init_once();

	// Step 1: initialize all fields except the buffer and strides.
	bli_obj_create_without_buffer( dt, m, n, obj );

	// Step 2: point the object at the caller's buffer.
	bli_obj_attach_buffer( p, rs, cs, is, obj );
}
// Initialize obj as an m x n object of datatype dt with no buffer attached.
// All datatype fields (storage, target, execution, computation, scalar) are
// set to dt, offsets and the diagonal offset are zeroed, the pack/kernel
// function hooks are cleared, and the internal scalar is set to one.
void bli_obj_create_without_buffer( num_t dt, dim_t m, dim_t n, obj_t* obj )
{
	bli_init_once();

	if ( bli_error_checking_is_enabled() )
	{
		bli_obj_create_without_buffer_check( dt, m, n, obj );
	}

	// Size in bytes of a single element of datatype dt.
	const siz_t dt_bytes = bli_dt_size( dt );

	// Apply default properties before setting anything specific.
	bli_obj_set_defaults( obj );

	// The new object is its own root: it is not presumed to be a view into
	// a larger matrix. Subpartitions and aliases made from it later receive
	// copies of this field and so can always find the original object.
	bli_obj_set_as_root( obj );

	// Buffer, datatypes and element size.
	bli_obj_set_buffer( NULL, obj );
	bli_obj_set_dt( dt, obj );
	bli_obj_set_elem_size( dt_bytes, obj );
	bli_obj_set_target_dt( dt, obj );
	bli_obj_set_exec_dt( dt, obj );
	bli_obj_set_comp_dt( dt, obj );

	// Geometry.
	bli_obj_set_dims( m, n, obj );
	bli_obj_set_offs( 0, 0, obj );
	bli_obj_set_diag_offset( 0, obj );

	// Packing and kernel hooks start out empty.
	bli_obj_set_pack_fn( NULL, obj );
	bli_obj_set_pack_params( NULL, obj );
	bli_obj_set_ker_fn( NULL, obj );
	bli_obj_set_ker_params( NULL, obj );

	// Internal scalar: datatype dt, value 1.0.
	bli_obj_set_scalar_dt( dt, obj );
	void* const scalar = bli_obj_internal_scalar_buffer( obj );

	// The scalar is always written as a complex value of the matching
	// precision, even for real datatypes. Writing the imaginary component
	// is needed in mixed-domain scenarios; otherwise uninitialized memory
	// could be read just before the macrokernel is called, when the
	// internal scalars for A and B are merged.
	if ( bli_is_float( dt ) || bli_is_scomplex( dt ) )
	{
		bli_cset1s( *(( scomplex* )scalar) );
	}
	else if ( bli_is_double( dt ) || bli_is_dcomplex( dt ) )
	{
		bli_zset1s( *(( dcomplex* )scalar) );
	}
}
// Allocate a buffer for an object whose dimensions and element size are
// already set, and record the buffer and (possibly adjusted) strides in it.
// The strides rs, cs and is are first passed through bli_adjust_strides().
void bli_obj_alloc_buffer( inc_t rs, inc_t cs, inc_t is, obj_t* obj )
{
	err_t r_val;

	bli_init_once();

	// Dimensions and per-element size of the object being allocated.
	const dim_t m        = bli_obj_length( obj );
	const dim_t n        = bli_obj_width( obj );
	const siz_t elem_len = bli_obj_elem_size( obj );

	// Strides are adjusted before anything else, in particular before any
	// error checking is done on them.
	bli_adjust_strides( m, n, elem_len, &rs, &cs, &is );

	if ( bli_error_checking_is_enabled() )
	{
		bli_obj_alloc_buffer_check( rs, cs, is, obj );
	}

	// Element count. Empty objects need no elements. Otherwise the count is
	// the distance from the element with the lowest address (usually
	// {0, 0}) to the one with the highest address (usually {m-1, n-1}),
	// plus one for the highest element itself.
	dim_t count = ( m == 0 || n == 0 )
	              ? 0
	              : (m-1) * bli_abs( rs ) + (n-1) * bli_abs( cs ) + 1;

	// Complex objects may have an imaginary stride larger than normal.
	// Adding is/2 works whether the imaginary stride is unit, between unit
	// and 2*count, or larger than 2*count.
	if ( bli_obj_is_complex( obj ) )
	{
		count = bli_abs( is ) / 2 + count;
	}

	// Total bytes, including any padding introduced when the leading
	// dimension was enlarged for alignment.
	const siz_t n_bytes = ( siz_t )count * elem_len;

	void* const buf = bli_malloc_user( n_bytes, &r_val );

	// Record the buffer and the final strides in the object.
	bli_obj_set_buffer( buf, obj );
	bli_obj_set_strides( rs, cs, obj );
	bli_obj_set_imag_stride( is, obj );
}
// Point obj at the caller-supplied buffer p and record the given strides.
// An imaginary stride of 0 is treated as a request for the default of 1.
// The caller must supply valid row and column strides; they are not
// adjusted here.
void bli_obj_attach_buffer( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj )
{
	bli_init_once();

	// Map is == 0 onto the default imaginary stride.
	const inc_t imag_stride = ( is == 0 ? 1 : is );

	// Verify that strides and dimensions are compatible.
	if ( bli_error_checking_is_enabled() )
	{
		bli_obj_attach_buffer_check( p, rs, cs, imag_stride, obj );
	}

	// Store the buffer address and strides in the object.
	bli_obj_set_buffer( p, obj );
	bli_obj_set_strides( rs, cs, obj );
	bli_obj_set_imag_stride( imag_stride, obj );
}
// Create a 1x1 object of datatype dt and allocate its buffer, using unit
// row, column and imaginary strides.
void bli_obj_create_1x1( num_t dt, obj_t* obj )
{
	const dim_t one_dim = 1;
	const inc_t one_inc = 1;

	bli_obj_create_without_buffer( dt, one_dim, one_dim, obj );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_obj_create_1x1(): " );
#endif

	bli_obj_alloc_buffer( one_inc, one_inc, one_inc, obj );
}
// Create a 1x1 object of datatype dt that refers to the caller-supplied
// buffer p, using unit row, column and imaginary strides.
void bli_obj_create_1x1_with_attached_buffer( num_t dt, void* p, obj_t* obj )
{
	const dim_t one_dim = 1;
	const inc_t one_inc = 1;

	bli_obj_create_without_buffer( dt, one_dim, one_dim, obj );

	bli_obj_attach_buffer( p, one_inc, one_inc, one_inc, obj );
}
// Create object d so that it is conformal to object s: same datatype, same
// length and width, and the same row and column strides are requested.
// A new buffer is allocated for d via bli_obj_create().
void bli_obj_create_conf_to( obj_t* s, obj_t* d )
{
	bli_obj_create
	(
	  bli_obj_dt( s ),
	  bli_obj_length( s ),
	  bli_obj_width( s ),
	  bli_obj_row_stride( s ),
	  bli_obj_col_stride( s ),
	  d
	);
}
// Free the buffer owned by obj via bli_free_user(). A NULL obj is ignored,
// as is an object whose buffer pointer refers to its own internal scalar
// storage (a detached scalar), since that memory was never allocated.
void bli_obj_free( obj_t* obj )
{
	if ( bli_error_checking_is_enabled() )
	{
		bli_obj_free_check( obj );
	}

	// Never dereference a NULL object.
	if ( obj == NULL ) return;

	// A buffer pointer equal to the address of the internal scalar buffer
	// marks a detached scalar; there is nothing to free in that case.
	if ( bli_obj_buffer( obj ) == bli_obj_internal_scalar_buffer( obj ) ) return;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_obj_free(): " );
#endif

	bli_free_user( bli_obj_buffer( obj ) );
}
// NOTE: Everything up to the matching '#endif' is compiled out. The code is
// not usable as written: both function headers are commented out, and the
// temp_* pointers are dereferenced although the statements that would
// assign them are commented out as well.
#if 0
//void bli_obj_create_const
(
double value,
obj_t* obj
)
{
gint_t* temp_i;
float* temp_s;
double* temp_d;
scomplex* temp_c;
dcomplex* temp_z;
if ( bli_error_checking_is_enabled() )
bli_obj_create_const_check( value, obj );
bli_obj_create( BLIS_CONSTANT, 1, 1, 1, 1, obj );
//temp_s = bli_obj_buffer_for_const( BLIS_FLOAT, obj );
//temp_d = bli_obj_buffer_for_const( BLIS_DOUBLE, obj );
//temp_c = bli_obj_buffer_for_const( BLIS_SCOMPLEX, obj );
//temp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, obj );
//temp_i = bli_obj_buffer_for_const( BLIS_INT, obj );
bli_dssets( value, 0.0, *temp_s );
bli_ddsets( value, 0.0, *temp_d );
bli_dcsets( value, 0.0, *temp_c );
bli_dzsets( value, 0.0, *temp_z );
*temp_i = ( gint_t ) value;
}
//void bli_obj_create_const_copy_of
(
obj_t* a,
obj_t* b
)
{
gint_t* temp_i;
float* temp_s;
double* temp_d;
scomplex* temp_c;
dcomplex* temp_z;
void* buf_a;
dcomplex value;
if ( bli_error_checking_is_enabled() )
bli_obj_create_const_copy_of_check( a, b );
bli_obj_create( BLIS_CONSTANT, 1, 1, 1, 1, b );
//temp_s = bli_obj_buffer_for_const( BLIS_FLOAT, b );
//temp_d = bli_obj_buffer_for_const( BLIS_DOUBLE, b );
//temp_c = bli_obj_buffer_for_const( BLIS_SCOMPLEX, b );
//temp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, b );
//temp_i = bli_obj_buffer_for_const( BLIS_INT, b );
buf_a = bli_obj_buffer_at_off( a );
bli_zzsets( 0.0, 0.0, value );
if ( bli_obj_is_float( a ) )
{
bli_szcopys( *(( float* )buf_a), value );
}
else if ( bli_obj_is_double( a ) )
{
bli_dzcopys( *(( double* )buf_a), value );
}
else if ( bli_obj_is_scomplex( a ) )
{
bli_czcopys( *(( scomplex* )buf_a), value );
}
else if ( bli_obj_is_dcomplex( a ) )
{
bli_zzcopys( *(( dcomplex* )buf_a), value );
}
else
{
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
bli_zscopys( value, *temp_s );
bli_zdcopys( value, *temp_d );
bli_zccopys( value, *temp_c );
bli_zzcopys( value, *temp_z );
*temp_i = ( gint_t ) bli_zreal( value );
}
#endif
// Inspect the strides supplied by the user for an m x n object and rewrite
// them in place where they are shorthand for a default layout:
//
//   rs = cs =  0 (with is of 0 or 1): column storage is requested.
//   rs = cs = -1 (with is of 0 or 1): row storage is requested.
//   rs = cs =  1: reserved for 1x1 scalars; for vectors, the stride that is
//                 never used for indexing is widened.
//
// In the first two cases the imaginary stride is set to 1 and the leading
// dimension is aligned via bli_align_dim_to_size(). Any other strides are
// left untouched, as are all strides when either dimension is zero.
void bli_adjust_strides( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is )
{
	// An empty object (possibly a zero-length slice of another matrix)
	// needs no adjustment.
	if ( m == 0 || n == 0 ) return;

	// Classify the request.
	const bool default_is    = ( *is == 0 || *is == 1 );
	const bool col_requested = ( *rs ==  0 && *cs ==  0 && default_is );
	const bool row_requested = ( *rs == -1 && *cs == -1 && default_is );

	if ( col_requested || row_requested )
	{
		if ( m == 1 && n == 1 )
		{
			// 1x1 scalars get unit strides under either request.
			*rs = 1;
			*cs = 1;
		}
		else if ( col_requested )
		{
			// Column-major storage, except when m == 1: then what amounts
			// to row-major storage is used, so that the two strides are
			// not both unit.
			if ( m == 1 && n > 1 )
			{
				*rs = n;
				*cs = 1;
			}
			else
			{
				*rs = 1;
				*cs = m;
			}
		}
		else
		{
			// Row-major storage, except when n == 1: then what amounts to
			// column-major storage is used, so that the two strides are
			// not both unit.
			if ( n == 1 && m > 1 )
			{
				*rs = 1;
				*cs = m;
			}
			else
			{
				*rs = n;
				*cs = 1;
			}
		}

		// Default complex storage.
		*is = 1;

		// Align the leading dimension according to the tilt of the matrix.
		// Scalars are neither row- nor column-tilted, so they are skipped.
		// Alignment happens only for these shorthand requests; strides
		// given explicitly by the user are preserved.
		if ( bli_is_col_tilted( m, n, *rs, *cs ) )
		{
			*cs = bli_align_dim_to_size( *cs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
		else if ( bli_is_row_tilted( m, n, *rs, *cs ) )
		{
			*rs = bli_align_dim_to_size( *rs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
	}
	else if ( *rs == 1 && *cs == 1 )
	{
		// Two unit strides are most likely a "lazy" request for a single
		// vector (though they could also mean a 1xn matrix in column-major
		// order or an mx1 matrix in row-major order). BLIS reserves
		// rs = cs = 1 for 1x1 scalars only, so vectors are rewritten.
		if ( m > 1 && n == 1 )
		{
			// Column vector in column-major order. The column stride is
			// set to m for legacy reasons: an underlying BLAS library's
			// error checking expects a leading dimension of at least m,
			// even though it is never used for indexing a single column.
			*cs = m;
		}
		else if ( m == 1 && n > 1 )
		{
			// Row vector in row-major order.
			*rs = n;
		}
		// The 1x1 case (m == n == 1) needs no change.
	}
}
// Table of per-datatype storage sizes, in bytes. bli_dt_size() indexes this
// table directly with the num_t value it is given, so the entry order must
// follow the numeric order of the num_t values.
static siz_t dt_sizes[6] =
{
    sizeof( float ),
    sizeof( scomplex ),
    sizeof( double ),
    sizeof( dcomplex ),
    sizeof( gint_t ),
    sizeof( constdata_t )
};

// Return the size, in bytes, of one element of datatype dt.
// When error checking is enabled, dt is first passed to bli_dt_size_check().
siz_t bli_dt_size
     (
       num_t dt
     )
{
    if ( bli_error_checking_is_enabled() )
    {
        bli_dt_size_check( dt );
    }

    const siz_t n_bytes = dt_sizes[ dt ];

    return n_bytes;
}
// Table of human-readable datatype names. bli_dt_string() indexes this
// table directly with the num_t value it is given.
static char* dt_names[ BLIS_NUM_FP_TYPES+1 ] =
{
    "float",
    "scomplex",
    "double",
    "dcomplex",
    "int"
};

// Return a string naming the datatype dt. The returned pointer refers to
// static storage; it is not a fresh allocation.
// When error checking is enabled, dt is first passed to
// bli_dt_string_check().
char* bli_dt_string
     (
       num_t dt
     )
{
    if ( bli_error_checking_is_enabled() )
    {
        bli_dt_string_check( dt );
    }

    char* name = dt_names[ dt ];

    return name;
}
// Round dim up to the nearest multiple of dim_mult and return the result.
// If dim_mult is zero, dim is returned unchanged (this avoids a division
// by zero).
dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     )
{
    if ( dim_mult != 0 )
    {
        // Number of dim_mult-sized groups needed to hold dim (integer
        // division after biasing by dim_mult - 1).
        const dim_t n_groups = ( dim + dim_mult - 1 ) / dim_mult;

        dim = n_groups * dim_mult;
    }

    return dim;
}
// Grow a dimension so that the storage it spans is aligned to align_size.
//
// The byte footprint of dim elements (dim * elem_size) is rounded up to the
// next multiple of align_size, and that footprint is then converted back to
// an element count with integer (truncating) division by elem_size.
//
// Parameters:
//   dim        - the dimension (element count) to align.
//   elem_size  - size of one element, in bytes.
//   align_size - the alignment, in bytes.
//
// Returns the aligned dimension. If elem_size or align_size is zero, dim is
// returned unmodified, since both values are used as divisors below. This
// mirrors the zero-multiple guard in bli_align_dim_to_mult().
dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     )
{
    // Avoid division by zero.
    if ( elem_size == 0 || align_size == 0 ) return dim;

    const dim_t elem_bytes  = ( dim_t )elem_size;
    const dim_t align_bytes = ( dim_t )align_size;

    // Bytes spanned by dim elements, rounded up to a multiple of the
    // alignment.
    const dim_t aligned_bytes = ( ( dim * elem_bytes + align_bytes - 1 ) /
                                  align_bytes
                                ) * align_bytes;

    // Convert the aligned byte count back into a number of elements.
    dim = aligned_bytes / elem_bytes;

    return dim;
}
// Round the address held in p up to the next multiple of align_size and
// return that address as an integer.
//
// Parameters:
//   p          - the pointer whose address is to be aligned.
//   align_size - the alignment, in bytes.
//
// Returns the aligned address converted to dim_t. If align_size is zero,
// the address of p is returned unmodified, since align_size is used as a
// divisor below.
dim_t bli_align_ptr_to_size
     (
       void*  p,
       size_t align_size
     )
{
    dim_t dim;

    const uintptr_t addr = ( uintptr_t )p;

    // Avoid division by zero.
    if ( align_size == 0 )
    {
        dim = addr;
        return dim;
    }

    // Bias the address by align_size - 1, then truncate to a multiple of
    // align_size.
    dim = ( ( addr + align_size - 1 ) /
            align_size
          ) * align_size;

    return dim;
}
#if 0
// NOTE: Everything in this #if 0 region is compiled out.
//
// type_union is a square lookup table over the floating-point datatypes:
// type_union[dt1][dt2] holds the datatype stored for the pair (dt1, dt2).
// The row/column labels (s, c, d, z) are given in the comments below.
static num_t type_union[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] =
{
// s c d z
/* s */ { BLIS_FLOAT, BLIS_SCOMPLEX, BLIS_DOUBLE, BLIS_DCOMPLEX },
/* c */ { BLIS_SCOMPLEX, BLIS_SCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX },
/* d */ { BLIS_DOUBLE, BLIS_DCOMPLEX, BLIS_DOUBLE, BLIS_DCOMPLEX },
/* z */ { BLIS_DCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX }
};
// Return the type_union entry for (dt1, dt2). When error checking is
// enabled, the pair is first passed to bli_dt_union_check().
num_t bli_dt_union( num_t dt1, num_t dt2 )
{
if ( bli_error_checking_is_enabled() )
bli_dt_union_check( dt1, dt2 );
return type_union[dt1][dt2];
}
#endif
// Write a human-readable dump of obj to stdout, preceded by label.
//
// The dump lists the object's dimensions, offsets, buffer address, element
// size, strides, padded dimensions and panel fields, followed by the raw
// info word and the individual properties queried from it.
// When error checking is enabled, the arguments are first passed to
// bli_obj_print_check().
void bli_obj_print
     (
       char*  label,
       obj_t* obj
     )
{
    bli_init_once();

    FILE* out = stdout;

    if ( bli_error_checking_is_enabled() )
    {
        bli_obj_print_check( label, obj );
    }

    // Label.
    fputc( '\n', out );
    fprintf( out, "%s\n", label );
    fputc( '\n', out );

    // Dimensions.
    fprintf( out, " m x n %lu x %lu\n",
             ( unsigned long )bli_obj_length( obj ),
             ( unsigned long )bli_obj_width( obj ) );
    fputc( '\n', out );

    // Offsets.
    fprintf( out, " offm, offn %lu, %lu\n",
             ( unsigned long )bli_obj_row_off( obj ),
             ( unsigned long )bli_obj_col_off( obj ) );
    fprintf( out, " diagoff %ld\n",
             ( signed long int )bli_obj_diag_offset( obj ) );
    fputc( '\n', out );

    // Buffer, strides, and packing-related dimensions.
    fprintf( out, " buf %p\n",
             ( void* )bli_obj_buffer( obj ) );
    fprintf( out, " elem size %lu\n",
             ( unsigned long )bli_obj_elem_size( obj ) );
    fprintf( out, " rs, cs %ld, %ld\n",
             ( signed long int )bli_obj_row_stride( obj ),
             ( signed long int )bli_obj_col_stride( obj ) );
    fprintf( out, " is %ld\n",
             ( signed long int )bli_obj_imag_stride( obj ) );
    fprintf( out, " m_padded %lu\n",
             ( unsigned long )bli_obj_padded_length( obj ) );
    fprintf( out, " n_padded %lu\n",
             ( unsigned long )bli_obj_padded_width( obj ) );
    fprintf( out, " pd %lu\n",
             ( unsigned long )bli_obj_panel_dim( obj ) );
    fprintf( out, " ps %lu\n",
             ( unsigned long )bli_obj_panel_stride( obj ) );
    fputc( '\n', out );

    // The raw info word (in hexadecimal), then its decoded properties.
    fprintf( out, " info %lX\n",
             ( unsigned long )obj->info );
    fprintf( out, " - is complex %lu\n",
             ( unsigned long )bli_obj_is_complex( obj ) );
    fprintf( out, " - is d. prec %lu\n",
             ( unsigned long )bli_obj_is_double_prec( obj ) );
    fprintf( out, " - datatype %lu\n",
             ( unsigned long )bli_obj_dt( obj ) );
    fprintf( out, " - target dt %lu\n",
             ( unsigned long )bli_obj_target_dt( obj ) );
    fprintf( out, " - exec dt %lu\n",
             ( unsigned long )bli_obj_exec_dt( obj ) );
    fprintf( out, " - comp dt %lu\n",
             ( unsigned long )bli_obj_comp_dt( obj ) );
    fprintf( out, " - scalar dt %lu\n",
             ( unsigned long )bli_obj_scalar_dt( obj ) );
    fprintf( out, " - has trans %lu\n",
             ( unsigned long )bli_obj_has_trans( obj ) );
    fprintf( out, " - has conj %lu\n",
             ( unsigned long )bli_obj_has_conj( obj ) );
    fprintf( out, " - unit diag? %lu\n",
             ( unsigned long )bli_obj_has_unit_diag( obj ) );

    // NOTE: In the shifted fields below, the cast applies to the queried
    // value first; the shift is then performed on the unsigned long.
    fprintf( out, " - struc type %lu\n",
             ( unsigned long )bli_obj_struc( obj ) >> BLIS_STRUC_SHIFT );
    fprintf( out, " - uplo type %lu\n",
             ( unsigned long )bli_obj_uplo( obj ) >> BLIS_UPLO_SHIFT );
    fprintf( out, " - is upper %lu\n",
             ( unsigned long )bli_obj_is_upper( obj ) );
    fprintf( out, " - is lower %lu\n",
             ( unsigned long )bli_obj_is_lower( obj ) );
    fprintf( out, " - is dense %lu\n",
             ( unsigned long )bli_obj_is_dense( obj ) );
    fprintf( out, " - pack schema %lu\n",
             ( unsigned long )bli_obj_pack_schema( obj ) >> BLIS_PACK_SCHEMA_SHIFT );
    fprintf( out, " - packinv diag? %lu\n",
             ( unsigned long )bli_obj_has_inverted_diag( obj ) );
    fprintf( out, " - pack ordifup %lu\n",
             ( unsigned long )bli_obj_is_pack_rev_if_upper( obj ) );
    fprintf( out, " - pack ordiflo %lu\n",
             ( unsigned long )bli_obj_is_pack_rev_if_lower( obj ) );
    fprintf( out, " - packbuf type %lu\n",
             ( unsigned long )bli_obj_pack_buffer_type( obj ) >> BLIS_PACK_BUFFER_SHIFT );
    fputc( '\n', out );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_obj.h 0000664 0000000 0000000 00000006764 14634250137 0021421 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_obj_check.h"
// -- Object creation, buffer management, and destruction ----------------------
// NOTE(review): the definitions of the bli_obj_create*(), bli_obj_alloc_buffer(),
// bli_obj_attach_buffer(), and bli_obj_free() functions are not visible next
// to these prototypes; the summaries below are inferred from the names and
// parameter lists and should be confirmed against bli_obj.c.

// Presumably creates an m x n object of datatype dt with row/column strides
// rs/cs, allocating its buffer.
BLIS_EXPORT_BLIS void bli_obj_create
(
num_t dt,
dim_t m,
dim_t n,
inc_t rs,
inc_t cs,
obj_t* obj
);
// Presumably like bli_obj_create(), but uses the caller-provided buffer p
// instead of allocating one.
BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
(
num_t dt,
dim_t m,
dim_t n,
void* p,
inc_t rs,
inc_t cs,
obj_t* obj
);
// Initializes an m x n object of datatype dt with no buffer attached.
BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
(
num_t dt,
dim_t m,
dim_t n,
obj_t* obj
);
// Presumably allocates a buffer for an existing bufferless object using the
// given row, column, and imaginary strides.
BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
(
inc_t rs,
inc_t cs,
inc_t is,
obj_t* obj
);
// Presumably attaches the caller-provided buffer p, with the given strides,
// to an existing bufferless object.
BLIS_EXPORT_BLIS void bli_obj_attach_buffer
(
void* p,
inc_t rs,
inc_t cs,
inc_t is,
obj_t* obj
);
// Presumably creates a 1x1 (scalar) object of datatype dt.
BLIS_EXPORT_BLIS void bli_obj_create_1x1
(
num_t dt,
obj_t* obj
);
// Presumably creates a 1x1 (scalar) object of datatype dt around the
// caller-provided buffer p.
BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
(
num_t dt,
void* p,
obj_t* obj
);
// Presumably creates object d so that it is conformal to object s.
BLIS_EXPORT_BLIS void bli_obj_create_conf_to
(
obj_t* s,
obj_t* d
);
// Presumably releases the resources held by obj.
BLIS_EXPORT_BLIS void bli_obj_free
(
obj_t* obj
);
// -- Stride adjustment ---------------------------------------------------------
// Rewrites *rs, *cs, and *is in place for an m x n matrix of elem_size-byte
// elements. Among other cases, rs = cs = -1 selects default row-major
// strides, and rs = cs = 1 is rewritten for vectors so that unit strides in
// both dimensions are reserved for 1x1 scalars. Not exported.
void bli_adjust_strides
(
dim_t m,
dim_t n,
siz_t elem_size,
inc_t* rs,
inc_t* cs,
inc_t* is
);
// -- Datatype queries ----------------------------------------------------------
// Returns the size, in bytes, of one element of datatype dt.
BLIS_EXPORT_BLIS siz_t bli_dt_size
(
num_t dt
);
// Returns a statically-allocated string naming datatype dt.
BLIS_EXPORT_BLIS char* bli_dt_string
(
num_t dt
);
// -- Alignment helpers ---------------------------------------------------------
// Rounds dim up to a multiple of dim_mult; returns dim unchanged if dim_mult
// is zero.
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
(
dim_t dim,
dim_t dim_mult
);
// Rounds the byte footprint of dim elements (of elem_size bytes each) up to
// a multiple of align_size and returns the result as an element count.
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
(
dim_t dim,
siz_t elem_size,
siz_t align_size
);
// Rounds the address held in p up to a multiple of align_size and returns
// it as an integer.
BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
(
void* p,
size_t align_size
);
// -- Debugging -----------------------------------------------------------------
// Prints label followed by a dump of obj's fields to stdout.
BLIS_EXPORT_BLIS void bli_obj_print
(
char* label,
obj_t* obj
);
cython-blis-1.0.0/blis/_src/frame/base/bli_obj_scalar.c 0000664 0000000 0000000 00000016627 14634250137 0022740 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Initialize beta as a 1x1 object of datatype dt whose buffer is beta's own
// internal scalar storage (i.e., no separate buffer is used).
void bli_obj_scalar_init_detached
     (
       num_t  dt,
       obj_t* beta
     )
{
    // Start from a bufferless 1x1 object.
    // NOTE: This initializes both the storage datatype and scalar datatype
    // bitfields within beta to dt.
    bli_obj_create_without_buffer( dt, 1, 1, beta );

    // Locate the scalar storage embedded in the object itself.
    void* internal_buf = bli_obj_internal_scalar_buffer( beta );

    // Make that storage the object's buffer, with unit strides throughout.
    bli_obj_set_buffer( internal_buf, beta );
    bli_obj_set_strides( 1, 1, beta );
    bli_obj_set_imag_stride( 1, beta );
}
// Initialize beta as a detached 1x1 scalar of datatype dt and set its value
// to a copy of alpha, with conj applied to alpha first. The copy is made
// with bli_copysc(), which handles any conjugation and/or typecasting.
// alpha itself is not modified; conj is applied to a local alias.
void bli_obj_scalar_init_detached_copy_of
     (
       num_t  dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     )
{
    // Work on an alias of alpha so the conj parameter can be applied
    // without touching the caller's object.
    obj_t alpha_conj;

    bli_obj_alias_to( alpha, &alpha_conj );
    bli_obj_apply_conj( conj, &alpha_conj );

    // Set up beta as a 1x1 object backed by its internal scalar storage.
    bli_obj_scalar_init_detached( dt, beta );

    // Copy the (possibly conjugated) value of alpha into beta.
    bli_copysc( &alpha_conj, beta );
}
// Extract the scalar attached to a into the standalone 1x1 object alpha.
// alpha's storage datatype is set to the scalar datatype of a.
void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     )
{
    // Initialize alpha as a bufferless internal scalar whose storage
    // datatype is the scalar datatype of A.
    bli_obj_scalar_init_detached( bli_obj_scalar_dt( a ), alpha );

    // Copy A's internal scalar into alpha.
    // NOTE: This is a plain field-to-field copy with no typecasting, which
    // is fine because alpha was just initialized with the same datatype as
    // the scalar being copied.
    bli_obj_copy_internal_scalar( a, alpha );
}
// Attach the scalar alpha (with conj applied) to object a. alpha is first
// copy-cast to the target datatype of a; the result replaces a's internal
// scalar, and a's scalar datatype is updated to that target datatype.
void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     )
{
    // The datatype to which alpha is cast locally is the target datatype
    // of A.
    const num_t dt_a_targ = bli_obj_target_dt( a );

    // Build a conjugated and/or typecast copy of alpha in that datatype.
    obj_t alpha_local;

    bli_obj_scalar_init_detached_copy_of
    (
      dt_a_targ,
      conj,
      alpha,
      &alpha_local
    );

    // Store the copy as A's internal scalar and record its datatype.
    bli_obj_copy_internal_scalar( &alpha_local, a );
    bli_obj_set_scalar_dt( dt_a_targ, a );
}
// Typecast the scalar attached to a so that it is stored with datatype dt,
// and update a's scalar datatype accordingly.
void bli_obj_scalar_cast_to
     (
       num_t  dt,
       obj_t* a
     )
{
    obj_t scalar_old;
    obj_t scalar_new;

    // Pull A's internal scalar out into scalar_old, a bufferless scalar
    // whose storage datatype equals A's current scalar datatype.
    // NOTE: Because the datatypes match, the field-to-field copy needs no
    // casting.
    bli_obj_scalar_init_detached( bli_obj_scalar_dt( a ), &scalar_old );
    bli_obj_copy_internal_scalar( a, &scalar_old );

    // Create scalar_new as a copy of scalar_old in the datatype requested
    // by the caller. (This is where the typecasting happens.)
    bli_obj_scalar_init_detached_copy_of
    (
      dt,
      BLIS_NO_CONJUGATE,
      &scalar_old,
      &scalar_new
    );

    // Write the typecast value back into A and update A's scalar datatype
    // to reflect the new datatype.
    bli_obj_copy_internal_scalar( &scalar_new, a );
    bli_obj_set_scalar_dt( dt, a );
}
// Scale the scalar attached to a by alpha. alpha is first typecast to a's
// scalar datatype; the product is computed with bli_mulsc() and stored back
// as a's internal scalar.
void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     )
{
    obj_t alpha_local;
    obj_t a_scalar;

    // Copy alpha into alpha_local using the scalar datatype of A. (This
    // is where the typecasting happens.)
    bli_obj_scalar_init_detached_copy_of
    (
      bli_obj_scalar_dt( a ),
      BLIS_NO_CONJUGATE,
      alpha,
      &alpha_local
    );

    // Pull A's scalar out into a standalone object.
    bli_obj_scalar_detach( a, &a_scalar );

    // Multiply the detached scalar by alpha.
    bli_mulsc( &alpha_local, &a_scalar );

    // Store the scaled value back as A's internal scalar.
    bli_obj_copy_internal_scalar( &a_scalar, a );
}
// Reset the scalar attached to a to the value held by BLIS_ONE for a's
// scalar datatype. Scalar datatypes other than float, double, scomplex,
// and dcomplex are left untouched.
void bli_obj_scalar_reset
     (
       obj_t* a
     )
{
    num_t dt  = bli_obj_scalar_dt( a );
    void* dst = bli_obj_internal_scalar_buffer( a );
    void* src = bli_obj_buffer_for_const( dt, &BLIS_ONE );

    // Copy the constant using the element type that matches dt.
    if ( bli_is_float( dt ) )
    {
        *( ( float* )dst ) = *( ( float* )src );
    }
    else if ( bli_is_double( dt ) )
    {
        *( ( double* )dst ) = *( ( double* )src );
    }
    else if ( bli_is_scomplex( dt ) )
    {
        *( ( scomplex* )dst ) = *( ( scomplex* )src );
    }
    else if ( bli_is_dcomplex( dt ) )
    {
        *( ( dcomplex* )dst ) = *( ( dcomplex* )src );
    }

    // Alternate implementation:
    //bli_obj_scalar_attach( BLIS_NO_CONJUGATE, &BLIS_ONE, a );
}
// Return TRUE if the scalar attached to a has a nonzero imaginary part.
// Real scalar datatypes, and any datatype that is neither scomplex nor
// dcomplex, yield FALSE.
bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     )
{
    num_t dt  = bli_obj_scalar_dt( a );
    void* buf = bli_obj_internal_scalar_buffer( a );

    // FGVZ: Reimplement by using bli_obj_imag_part() and then
    // bli_obj_equals( &BLIS_ZERO, ... ).

    // Real scalars have no imaginary component.
    if ( bli_is_real( dt ) ) return FALSE;

    // Complex scalars: inspect the imaginary component directly.
    if ( bli_is_scomplex( dt ) )
    {
        return ( bli_cimag( *(( scomplex* )buf) ) != 0.0F );
    }

    if ( bli_is_dcomplex( dt ) )
    {
        return ( bli_zimag( *(( dcomplex* )buf) ) != 0.0 );
    }

    return FALSE;
}
// Return the result of comparing the scalar attached to a against beta
// with bli_obj_equals().
bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     )
{
    // Detach A's scalar into a standalone object so it can be compared.
    obj_t a_scalar;

    bli_obj_scalar_detach( a, &a_scalar );

    return bli_obj_equals( &a_scalar, beta );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_obj_scalar.h 0000664 0000000 0000000 00000005047 14634250137 0022737 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Initializes beta as a 1x1 object of datatype dt whose buffer is its own
// internal scalar storage.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
(
num_t dt,
obj_t* beta
);
// Initializes beta as a detached scalar of datatype dt and copies alpha
// (with conj applied) into it, typecasting if needed.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
(
num_t dt,
conj_t conj,
obj_t* alpha,
obj_t* beta
);
// Copies the scalar attached to a into the standalone scalar object alpha.
BLIS_EXPORT_BLIS void bli_obj_scalar_detach
(
obj_t* a,
obj_t* alpha
);
// Attaches alpha (with conj applied) to a, casting it to a's target
// datatype and updating a's scalar datatype.
BLIS_EXPORT_BLIS void bli_obj_scalar_attach
(
conj_t conj,
obj_t* alpha,
obj_t* a
);
// Typecasts the scalar attached to a to datatype dt.
BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
(
num_t dt,
obj_t* a
);
// Scales the scalar attached to a by alpha.
BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
(
obj_t* alpha,
obj_t* a
);
// Resets the scalar attached to a to the value of BLIS_ONE.
BLIS_EXPORT_BLIS void bli_obj_scalar_reset
(
obj_t* a
);
// Returns whether the scalar attached to a has a nonzero imaginary part.
BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
(
obj_t* a
);
// Returns whether the scalar attached to a equals beta, as determined by
// bli_obj_equals().
BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
(
obj_t* a,
obj_t* beta
);
cython-blis-1.0.0/blis/_src/frame/base/bli_opid.h 0000664 0000000 0000000 00000003406 14634250137 0021570 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Return TRUE if opid lies in the closed range [BLIS_GEMM, BLIS_TRSM],
// which is how this header identifies level-3 operation ids.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
    // Equivalent to: BLIS_GEMM <= opid && opid <= BLIS_TRSM.
    const bool out_of_range = ( opid < BLIS_GEMM || BLIS_TRSM < opid );

    return ( bool )( !out_of_range );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_pack.c 0000664 0000000 0000000 00000011667 14634250137 0021556 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The global rntm_t structure. (The definition resides in bli_rntm.c.)
// The bli_pack_get_*() and bli_pack_set_*() functions in this file read
// and write its pack_a/pack_b fields.
extern rntm_t global_rntm;
// A mutex used to serialize updates to global_rntm. (The definition
// resides in bli_rntm.c.) The bli_pack_set_*() functions in this file hold
// it while writing; the bli_pack_get_*() functions read without locking.
extern bli_pthread_mutex_t global_rntm_mutex;
// -----------------------------------------------------------------------------
void bli_pack_init( void )
{
// Read the environment variables and use them to initialize the
// global runtime object.
bli_pack_init_rntm_from_env( &global_rntm );
}
// Counterpart to bli_pack_init(). Intentionally empty: the function body
// performs no work.
void bli_pack_finalize( void )
{
}
// -----------------------------------------------------------------------------
// Store the global runtime's current pack_a setting in *pack_a.
void bli_pack_get_pack_a( bool* pack_a )
{
    // global_rntm must be initialized before it is queried.
    bli_init_once();

    const bool cur_pack_a = bli_rntm_pack_a( &global_rntm );

    *pack_a = cur_pack_a;
}
// -----------------------------------------------------------------------------
// Store the global runtime's current pack_b setting in *pack_b.
void bli_pack_get_pack_b( bool* pack_b )
{
    // global_rntm must be initialized before it is queried.
    bli_init_once();

    const bool cur_pack_b = bli_rntm_pack_b( &global_rntm );

    *pack_b = cur_pack_b;
}
// ----------------------------------------------------------------------------
// Set the global runtime's pack_a setting to pack_a. The update is made
// while holding global_rntm_mutex.
void bli_pack_set_pack_a( bool pack_a )
{
    // global_rntm must be initialized before it is modified.
    bli_init_once();

    // Critical section: update global_rntm under its mutex.
    bli_pthread_mutex_lock( &global_rntm_mutex );
    {
        bli_rntm_set_pack_a( pack_a, &global_rntm );
    }
    bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// ----------------------------------------------------------------------------
// Set the global runtime's pack_b setting to pack_b. The update is made
// while holding global_rntm_mutex.
void bli_pack_set_pack_b( bool pack_b )
{
    // global_rntm must be initialized before it is modified.
    bli_init_once();

    // Critical section: update global_rntm under its mutex.
    bli_pthread_mutex_lock( &global_rntm_mutex );
    {
        bli_rntm_set_pack_b( pack_b, &global_rntm );
    }
    bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// ----------------------------------------------------------------------------
// Set the pack_a and pack_b fields of rntm from the BLIS_PACK_A and
// BLIS_PACK_B environment variables. For each variable: unset (read back
// as -1) or 0 means FALSE; any other value means TRUE.
void bli_pack_init_rntm_from_env
     (
       rntm_t* rntm
     )
{
    // NOTE: global_rntm_mutex is not acquired here because this function
    // is only called from bli_pack_init(), which is only called by
    // bli_init_once().

    bool pack_a;
    bool pack_b;

#if 1 //def BLIS_ENABLE_SELECTIVE_PACKING

    // Read both variables, using -1 as the value reported when a variable
    // is unset.
    const gint_t env_a = bli_env_get_var( "BLIS_PACK_A", -1 );
    const gint_t env_b = bli_env_get_var( "BLIS_PACK_B", -1 );

    // Unset (-1, the default behavior) and zero both disable packing;
    // anything else enables it.
    pack_a = ( env_a == -1 || env_a == 0 ) ? FALSE : TRUE;
    pack_b = ( env_b == -1 || env_b == 0 ) ? FALSE : TRUE;

#else

    pack_a = TRUE;
    pack_b = TRUE;

#endif

    // Record the results in the runtime object.
    bli_rntm_set_pack_a( pack_a, rntm );
    bli_rntm_set_pack_b( pack_b, rntm );

#if 0
    printf( "bli_pack_init_rntm_from_env()\n" );
    bli_rntm_print( rntm );
#endif
}
cython-blis-1.0.0/blis/_src/frame/base/bli_pack.h 0000664 0000000 0000000 00000004030 14634250137 0021545 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_PACK_H
#define BLIS_PACK_H
// Initializes the packing fields of the global runtime object from the
// environment. Not exported.
void bli_pack_init( void );
// Counterpart to bli_pack_init(). Not exported.
void bli_pack_finalize( void );
// Query the global runtime's pack_a / pack_b settings; the result is
// written through the pointer argument.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
// Update the global runtime's pack_a / pack_b settings.
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );
// Sets the pack_a / pack_b fields of rntm from the BLIS_PACK_A and
// BLIS_PACK_B environment variables. Not exported.
void bli_pack_init_rntm_from_env( rntm_t* rntm );
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_param_map.c 0000664 0000000 0000000 00000017001 14634250137 0022561 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// --- BLIS to BLAS/LAPACK mappings --------------------------------------------
// Map a BLIS side_t to its netlib (BLAS/LAPACK) character:
// BLIS_LEFT -> 'L', BLIS_RIGHT -> 'R'. Any other value is reported via
// bli_check_error_code( BLIS_INVALID_SIDE ) and *blas_side is not written.
void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side )
{
    switch ( side )
    {
        case BLIS_LEFT:  *blas_side = 'L'; break;
        case BLIS_RIGHT: *blas_side = 'R'; break;

        default:
            bli_check_error_code( BLIS_INVALID_SIDE );
            break;
    }
}
// Map a BLIS uplo_t to its netlib (BLAS/LAPACK) character:
// BLIS_LOWER -> 'L', BLIS_UPPER -> 'U'. Any other value is reported via
// bli_check_error_code( BLIS_INVALID_UPLO ) and *blas_uplo is not written.
void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo )
{
    switch ( uplo )
    {
        case BLIS_LOWER: *blas_uplo = 'L'; break;
        case BLIS_UPPER: *blas_uplo = 'U'; break;

        default:
            bli_check_error_code( BLIS_INVALID_UPLO );
            break;
    }
}
// Map a BLIS trans_t to its netlib (BLAS/LAPACK) character:
// BLIS_NO_TRANSPOSE -> 'N', BLIS_TRANSPOSE -> 'T',
// BLIS_CONJ_TRANSPOSE -> 'C'. Any other value is reported via
// bli_check_error_code( BLIS_INVALID_TRANS ) and *blas_trans is not
// written.
void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans )
{
    switch ( trans )
    {
        case BLIS_NO_TRANSPOSE:   *blas_trans = 'N'; break;
        case BLIS_TRANSPOSE:      *blas_trans = 'T'; break;
        case BLIS_CONJ_TRANSPOSE: *blas_trans = 'C'; break;

        default:
            bli_check_error_code( BLIS_INVALID_TRANS );
            break;
    }
}
// Map a BLIS diag_t to its netlib (BLAS/LAPACK) character:
// BLIS_NONUNIT_DIAG -> 'N', BLIS_UNIT_DIAG -> 'U'. Any other value is
// reported via bli_check_error_code( BLIS_INVALID_DIAG ) and *blas_diag is
// not written.
void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag )
{
    switch ( diag )
    {
        case BLIS_NONUNIT_DIAG: *blas_diag = 'N'; break;
        case BLIS_UNIT_DIAG:    *blas_diag = 'U'; break;

        default:
            bli_check_error_code( BLIS_INVALID_DIAG );
            break;
    }
}
// Map a BLIS machval_t to the single-character code written to
// *blas_machval (see the case labels below for the full mapping). Any
// other value is reported via bli_check_error_code( BLIS_INVALID_MACHVAL )
// and *blas_machval is not written.
void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval )
{
    switch ( machval )
    {
        case BLIS_MACH_EPS:      *blas_machval = 'E'; break;
        case BLIS_MACH_SFMIN:    *blas_machval = 'S'; break;
        case BLIS_MACH_BASE:     *blas_machval = 'B'; break;
        case BLIS_MACH_PREC:     *blas_machval = 'P'; break;
        case BLIS_MACH_NDIGMANT: *blas_machval = 'N'; break;
        case BLIS_MACH_RND:      *blas_machval = 'R'; break;
        case BLIS_MACH_EMIN:     *blas_machval = 'M'; break;
        case BLIS_MACH_RMIN:     *blas_machval = 'U'; break;
        case BLIS_MACH_EMAX:     *blas_machval = 'L'; break;
        case BLIS_MACH_RMAX:     *blas_machval = 'O'; break;

        default:
            bli_check_error_code( BLIS_INVALID_MACHVAL );
            break;
    }
}
// --- BLAS/LAPACK to BLIS mappings --------------------------------------------
// NOTE: These functions were converted into static functions. Please see this
// file's corresponding header for those definitions.
// --- BLIS char to BLIS mappings ----------------------------------------------
// Map a side character (case-insensitive) to a BLIS side_t:
// 'l' -> BLIS_LEFT, 'r' -> BLIS_RIGHT. Any other character is reported via
// bli_check_error_code( BLIS_INVALID_SIDE ) and *blis_side is not written.
void bli_param_map_char_to_blis_side( char side, side_t* blis_side )
{
    switch ( side )
    {
        case 'l':
        case 'L':
            *blis_side = BLIS_LEFT;
            break;

        case 'r':
        case 'R':
            *blis_side = BLIS_RIGHT;
            break;

        default:
            bli_check_error_code( BLIS_INVALID_SIDE );
            break;
    }
}
// Map an uplo character (case-insensitive) to a BLIS uplo_t:
// 'l' -> BLIS_LOWER, 'u' -> BLIS_UPPER, 'e' -> BLIS_DENSE. Any other
// character is reported via bli_check_error_code( BLIS_INVALID_UPLO ) and
// *blis_uplo is not written.
void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo )
{
    switch ( uplo )
    {
        case 'l':
        case 'L':
            *blis_uplo = BLIS_LOWER;
            break;

        case 'u':
        case 'U':
            *blis_uplo = BLIS_UPPER;
            break;

        case 'e':
        case 'E':
            *blis_uplo = BLIS_DENSE;
            break;

        default:
            bli_check_error_code( BLIS_INVALID_UPLO );
            break;
    }
}
// Map a trans character (case-insensitive) to a BLIS trans_t:
// 'n' -> BLIS_NO_TRANSPOSE, 't' -> BLIS_TRANSPOSE,
// 'c' -> BLIS_CONJ_NO_TRANSPOSE, 'h' -> BLIS_CONJ_TRANSPOSE. Any other
// character is reported via bli_check_error_code( BLIS_INVALID_TRANS ) and
// *blis_trans is not written.
void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans )
{
    switch ( trans )
    {
        case 'n':
        case 'N':
            *blis_trans = BLIS_NO_TRANSPOSE;
            break;

        case 't':
        case 'T':
            *blis_trans = BLIS_TRANSPOSE;
            break;

        case 'c':
        case 'C':
            *blis_trans = BLIS_CONJ_NO_TRANSPOSE;
            break;

        case 'h':
        case 'H':
            *blis_trans = BLIS_CONJ_TRANSPOSE;
            break;

        default:
            bli_check_error_code( BLIS_INVALID_TRANS );
            break;
    }
}
// Map a conj character (case-insensitive) to a BLIS conj_t:
// 'n' -> BLIS_NO_CONJUGATE, 'c' -> BLIS_CONJUGATE. Any other character is
// reported via bli_check_error_code( BLIS_INVALID_CONJ ) and *blis_conj is
// not written.
void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj )
{
    switch ( conj )
    {
        case 'n':
        case 'N':
            *blis_conj = BLIS_NO_CONJUGATE;
            break;

        case 'c':
        case 'C':
            *blis_conj = BLIS_CONJUGATE;
            break;

        default:
            bli_check_error_code( BLIS_INVALID_CONJ );
            break;
    }
}
// Map a diag character (case-insensitive) to a BLIS diag_t:
// 'n' -> BLIS_NONUNIT_DIAG, 'u' -> BLIS_UNIT_DIAG. Any other character is
// reported via bli_check_error_code( BLIS_INVALID_DIAG ) and *blis_diag is
// not written.
void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag )
{
    switch ( diag )
    {
        case 'n':
        case 'N':
            *blis_diag = BLIS_NONUNIT_DIAG;
            break;

        case 'u':
        case 'U':
            *blis_diag = BLIS_UNIT_DIAG;
            break;

        default:
            bli_check_error_code( BLIS_INVALID_DIAG );
            break;
    }
}
// Translate a BLIS datatype character into a num_t value, written through
// blis_dt:
//   's' -> BLIS_FLOAT      'd' -> BLIS_DOUBLE
//   'c' -> BLIS_SCOMPLEX   'z' -> BLIS_DCOMPLEX
//   'i' -> BLIS_INT
// Unlike the other char-to-BLIS mappings in this file, only lowercase
// characters are recognized. Any other character is reported via
// bli_check_error_code( BLIS_INVALID_DATATYPE ), and this function does not
// assign *blis_dt in that case.
void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt )
{
	switch ( dt )
	{
		case 's': *blis_dt = BLIS_FLOAT;    break;
		case 'd': *blis_dt = BLIS_DOUBLE;   break;
		case 'c': *blis_dt = BLIS_SCOMPLEX; break;
		case 'z': *blis_dt = BLIS_DCOMPLEX; break;
		case 'i': *blis_dt = BLIS_INT;      break;

		default:
			bli_check_error_code( BLIS_INVALID_DATATYPE );
			break;
	}
}
// --- BLIS to BLIS char mappings ----------------------------------------------
// Translate a side_t value into its lowercase BLIS character, written
// through side: BLIS_LEFT -> 'l', BLIS_RIGHT -> 'r'. Any other value is
// reported via bli_check_error_code( BLIS_INVALID_SIDE ), and this function
// does not assign *side in that case.
void bli_param_map_blis_to_char_side( side_t blis_side, char* side )
{
	if ( blis_side == BLIS_LEFT )
	{
		*side = 'l';
		return;
	}

	if ( blis_side == BLIS_RIGHT )
	{
		*side = 'r';
		return;
	}

	// No recognized value matched.
	bli_check_error_code( BLIS_INVALID_SIDE );
}
// Translate a uplo_t value into its lowercase BLIS character, written
// through uplo:
//   BLIS_LOWER -> 'l'
//   BLIS_UPPER -> 'u'
//   BLIS_DENSE -> 'e'
//
// BLIS_DENSE is mapped to 'e' so that this function is the inverse of
// bli_param_map_char_to_blis_uplo(), which accepts 'e'/'E' as BLIS_DENSE.
// Without this case, a value obtained from that function could not be
// converted back to a character.
//
// Any other value is reported via bli_check_error_code( BLIS_INVALID_UPLO ),
// and this function does not assign *uplo in that case.
void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo )
{
	if      ( blis_uplo == BLIS_LOWER ) *uplo = 'l';
	else if ( blis_uplo == BLIS_UPPER ) *uplo = 'u';
	else if ( blis_uplo == BLIS_DENSE ) *uplo = 'e';
	else
	{
		bli_check_error_code( BLIS_INVALID_UPLO );
	}
}
// Translate a trans_t value into its lowercase BLIS character, written
// through trans:
//   BLIS_NO_TRANSPOSE      -> 'n'
//   BLIS_TRANSPOSE         -> 't'
//   BLIS_CONJ_NO_TRANSPOSE -> 'c'
//   BLIS_CONJ_TRANSPOSE    -> 'h'
// Any other value is reported via bli_check_error_code( BLIS_INVALID_TRANS ),
// and this function does not assign *trans in that case.
void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans )
{
	if ( blis_trans == BLIS_NO_TRANSPOSE )
	{
		*trans = 'n';
		return;
	}

	if ( blis_trans == BLIS_TRANSPOSE )
	{
		*trans = 't';
		return;
	}

	if ( blis_trans == BLIS_CONJ_NO_TRANSPOSE )
	{
		*trans = 'c';
		return;
	}

	if ( blis_trans == BLIS_CONJ_TRANSPOSE )
	{
		*trans = 'h';
		return;
	}

	// No recognized value matched.
	bli_check_error_code( BLIS_INVALID_TRANS );
}
// Translate a conj_t value into its lowercase BLIS character, written
// through conj: BLIS_NO_CONJUGATE -> 'n', BLIS_CONJUGATE -> 'c'. Any other
// value is reported via bli_check_error_code( BLIS_INVALID_CONJ ), and this
// function does not assign *conj in that case.
void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj )
{
	if ( blis_conj == BLIS_NO_CONJUGATE )
	{
		*conj = 'n';
		return;
	}

	if ( blis_conj == BLIS_CONJUGATE )
	{
		*conj = 'c';
		return;
	}

	// No recognized value matched.
	bli_check_error_code( BLIS_INVALID_CONJ );
}
// Translate a diag_t value into its lowercase BLIS character, written
// through diag: BLIS_NONUNIT_DIAG -> 'n', BLIS_UNIT_DIAG -> 'u'. Any other
// value is reported via bli_check_error_code( BLIS_INVALID_DIAG ), and this
// function does not assign *diag in that case.
void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag )
{
	if ( blis_diag == BLIS_NONUNIT_DIAG )
	{
		*diag = 'n';
		return;
	}

	if ( blis_diag == BLIS_UNIT_DIAG )
	{
		*diag = 'u';
		return;
	}

	// No recognized value matched.
	bli_check_error_code( BLIS_INVALID_DIAG );
}
// Translate a num_t value into its lowercase BLIS datatype character,
// written through dt:
//   BLIS_FLOAT    -> 's'   BLIS_DOUBLE   -> 'd'
//   BLIS_SCOMPLEX -> 'c'   BLIS_DCOMPLEX -> 'z'
//   BLIS_INT      -> 'i'
// Any other value is reported via
// bli_check_error_code( BLIS_INVALID_DATATYPE ), and this function does not
// assign *dt in that case.
void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt )
{
	if ( blis_dt == BLIS_FLOAT )
	{
		*dt = 's';
		return;
	}

	if ( blis_dt == BLIS_DOUBLE )
	{
		*dt = 'd';
		return;
	}

	if ( blis_dt == BLIS_SCOMPLEX )
	{
		*dt = 'c';
		return;
	}

	if ( blis_dt == BLIS_DCOMPLEX )
	{
		*dt = 'z';
		return;
	}

	if ( blis_dt == BLIS_INT )
	{
		*dt = 'i';
		return;
	}

	// No recognized value matched.
	bli_check_error_code( BLIS_INVALID_DATATYPE );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_param_map.h 0000664 0000000 0000000 00000013105 14634250137 0022567 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// --- BLIS to BLAS/LAPACK mappings --------------------------------------------
// Exported prototypes. Each function takes a BLIS parameter value (side_t,
// uplo_t, trans_t, diag_t, or machval_t) and a pointer to the char that
// receives the corresponding BLAS/LAPACK character.
// NOTE(review): the definitions are not in this header; presumably they live
// in the corresponding .c file -- confirm.
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval );
// --- BLAS/LAPACK to BLIS mappings --------------------------------------------
// NOTE: These static functions were converted from regular functions in order
// to reduce function call overhead within the BLAS compatibility layer.
// Translate a BLAS/LAPACK side character into a side_t value, written
// through blis_side. 'l'/'L' yields BLIS_LEFT and 'r'/'R' yields BLIS_RIGHT.
// Unrecognized characters silently fall back to BLIS_LEFT (see below).
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side )
{
	switch ( side )
	{
		case 'r':
		case 'R':
			*blis_side = BLIS_RIGHT;
			break;

		case 'l':
		case 'L':
		default:
			// For unrecognized input, an arbitrary default is used rather
			// than reporting BLIS_INVALID_SIDE to the framework. This
			// function is called by the BLAS compatibility layer AFTER that
			// layer has already checked for errors and called xerbla(). An
			// application may override the compatibility layer's xerbla
			// (which responds to errors with abort()), so we must also NOT
			// abort here; the error has already been dealt with either way.
			*blis_side = BLIS_LEFT;
			break;
	}
}
// Translate a BLAS/LAPACK uplo character into a uplo_t value, written
// through blis_uplo. 'l'/'L' yields BLIS_LOWER and 'u'/'U' yields
// BLIS_UPPER. Unrecognized characters silently fall back to BLIS_LOWER.
BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo )
{
	switch ( uplo )
	{
		case 'u':
		case 'U':
			*blis_uplo = BLIS_UPPER;
			break;

		case 'l':
		case 'L':
		default:
			// Unrecognized input is deliberately not reported as
			// BLIS_INVALID_UPLO; an arbitrary default is used instead, for
			// the same reason documented in
			// bli_param_map_netlib_to_blis_side().
			*blis_uplo = BLIS_LOWER;
			break;
	}
}
// Translate a BLAS/LAPACK trans character into a trans_t value, written
// through blis_trans:
//   'n'/'N' -> BLIS_NO_TRANSPOSE
//   't'/'T' -> BLIS_TRANSPOSE
//   'c'/'C' -> BLIS_CONJ_TRANSPOSE
// Note that 'c' maps to BLIS_CONJ_TRANSPOSE here, whereas
// bli_param_map_char_to_blis_trans() is declared separately for BLIS's own
// character set. Unrecognized characters silently fall back to
// BLIS_NO_TRANSPOSE.
BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans )
{
	switch ( trans )
	{
		case 't':
		case 'T':
			*blis_trans = BLIS_TRANSPOSE;
			break;

		case 'c':
		case 'C':
			*blis_trans = BLIS_CONJ_TRANSPOSE;
			break;

		case 'n':
		case 'N':
		default:
			// Unrecognized input is deliberately not reported as
			// BLIS_INVALID_TRANS; an arbitrary default is used instead, for
			// the same reason documented in
			// bli_param_map_netlib_to_blis_side().
			*blis_trans = BLIS_NO_TRANSPOSE;
			break;
	}
}
// Translate a BLAS/LAPACK diag character into a diag_t value, written
// through blis_diag. 'n'/'N' yields BLIS_NONUNIT_DIAG and 'u'/'U' yields
// BLIS_UNIT_DIAG. Unrecognized characters silently fall back to
// BLIS_NONUNIT_DIAG.
BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag )
{
	switch ( diag )
	{
		case 'u':
		case 'U':
			*blis_diag = BLIS_UNIT_DIAG;
			break;

		case 'n':
		case 'N':
		default:
			// Unrecognized input is deliberately not reported as
			// BLIS_INVALID_DIAG; an arbitrary default is used instead, for
			// the same reason documented in
			// bli_param_map_netlib_to_blis_side().
			*blis_diag = BLIS_NONUNIT_DIAG;
			break;
	}
}
// --- BLIS char to BLIS mappings ----------------------------------------------
// Exported prototypes. Each function takes a BLIS parameter character and a
// pointer to the BLIS enum value (side_t, uplo_t, trans_t, conj_t, diag_t,
// or num_t) that receives the result.
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side );
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo );
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans );
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj );
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag );
BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt );
// --- BLIS to BLIS char mappings ----------------------------------------------
// Exported prototypes. Each function takes a BLIS enum value (side_t, uplo_t,
// trans_t, conj_t, diag_t, or num_t) and a pointer to the char that receives
// the corresponding BLIS parameter character.
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt );
cython-blis-1.0.0/blis/_src/frame/base/bli_part.c 0000664 0000000 0000000 00000060312 14634250137 0021575 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- Matrix partitioning ------------------------------------------------------
// Create in child a view of the bm x bn submatrix of parent whose top-left
// element is at offset (i,j) relative to parent. Offsets and dimensions that
// would reach past the parent's bounds are clamped so that the child never
// refers to anything outside the parent.
//
// Parameters:
// - i, j:    row and column offsets of the child within the parent.
// - bm, bn:  requested row and column dimensions of the child.
// - parent:  the object being partitioned.
// - child:   receives the aliased, offset, and resized view.
void bli_acquire_mpart
     (
       dim_t  i,
       dim_t  j,
       dim_t  bm,
       dim_t  bn,
       obj_t* parent,
       obj_t* child
     )
{
	// Dimensions of the parent; these bound the child view.
	const dim_t parent_m = bli_obj_length( parent );
	const dim_t parent_n = bli_obj_width( parent );

	// If either offset is already beyond what exists of the parent matrix,
	// slide it back to the outer dimension. (In that scenario the
	// corresponding child dimension is reduced to zero below.) This is a
	// safety measure and generally should never be needed if the caller is
	// passing in sane arguments.
	const dim_t off_m = ( i > parent_m ? parent_m : i );
	const dim_t off_n = ( j > parent_n ? parent_n : j );

	// If either requested dimension spills out over the edge of the parent
	// matrix, reduce it to what remains past the offset. This is also a
	// safety measure, though somewhat more likely to be needed than the
	// offset clamping above.
	const dim_t rem_m   = parent_m - off_m;
	const dim_t rem_n   = parent_n - off_n;
	const dim_t child_m = ( bm > rem_m ? rem_m : bm );
	const dim_t child_n = ( bn > rem_n ? rem_n : bn );

	// Alias the parent object's contents into the child object.
	bli_obj_alias_to( parent, child );

	// Offsets are incremented, rather than overwritten, in case the parent
	// object already had non-zero offsets (usually because the parent was
	// itself a child of a larger grandparent object).
	bli_obj_inc_offs( off_m, off_n, child );
	bli_obj_set_dims( child_m, child_n, child );
}
// Convenience wrapper: partition obj along the m dimension in the forward
// direction by delegating to bli_acquire_mpart_mdim() with BLIS_FWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_t2b
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_FWD;

	bli_acquire_mpart_mdim( direct, req_part, i, b, obj, sub_obj );
}
// Convenience wrapper: partition obj along the m dimension in the backward
// direction by delegating to bli_acquire_mpart_mdim() with BLIS_BWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_b2t
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_BWD;

	bli_acquire_mpart_mdim( direct, req_part, i, b, obj, sub_obj );
}
// Acquire a subpartition of obj along its m dimension (where m and n are
// taken after accounting for any transposition flagged on obj) and store the
// resulting view in sub_obj.
//
// Parameters:
// - direct:   BLIS_FWD or BLIS_BWD. For BLIS_BWD, i is relocated to
//             m - i - b before partitioning.
// - req_part: which subpartition to return: BLIS_SUBPART0, BLIS_SUBPART1,
//             BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, or the
//             direction-relative aliases BLIS_SUBPART1B / BLIS_SUBPART1A.
// - i:        offset of subpartition 1 along the m dimension.
// - b:        requested size of subpartition 1; clamped to at most m - i.
// - obj:      the object being partitioned.
// - sub_obj:  receives the requested subpartition.
//
// If req_part matches none of the values handled below, the offset
// increments and the partition dimensions keep their zero initial values.
void bli_acquire_mpart_mdim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t  m;
	dim_t  n;
	dim_t  m_part   = 0;
	dim_t  n_part   = 0;
	inc_t  offm_inc = 0;
	inc_t  offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	// NOTE(review): this branch is taken before direct is examined, so
	// direct is not passed on for panel-packed objects.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj );
		return;
	}

	// Check parameters. The t2b check function is used for both directions.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_t2b_check( req_part, i, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of the m dimension at
	// row offset i.
	if ( b > m - i ) b = m - i;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate i.
	if ( direct == BLIS_BWD )
	{
		// Modify i to account for the fact that we are moving backwards.
		i = m - i - b;
	}

	// Support SUBPART1B (behind SUBPART1) and SUBPART1A (ahead of SUBPART1),
	// to refer to subpartitions 0 and 2 when moving forward, and 2 and 0 when
	// moving backward.
	subpart_t subpart0_alias;
	subpart_t subpart2_alias;

	if ( direct == BLIS_FWD ) { subpart0_alias = BLIS_SUBPART1B;
	                            subpart2_alias = BLIS_SUBPART1A; }
	else                      { subpart0_alias = BLIS_SUBPART1A;
	                            subpart2_alias = BLIS_SUBPART1B; }

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.
	if      ( req_part == BLIS_SUBPART0 ||
	          req_part == subpart0_alias )
	{
		// A0 (offm,offn) unchanged.
		// A0 is i x n.
		offm_inc = 0;
		offn_inc = 0;
		m_part   = i;
		n_part   = n;
	}
	else if ( req_part == BLIS_SUBPART1AND0 )
	{
		// A1+A0 (offm,offn) unchanged.
		// A1+A0 is (i+b) x n.
		offm_inc = 0;
		offn_inc = 0;
		m_part   = i + b;
		n_part   = n;
	}
	else if ( req_part == BLIS_SUBPART1 )
	{
		// A1 (offm,offn) += (i,0).
		// A1 is b x n.
		offm_inc = i;
		offn_inc = 0;
		m_part   = b;
		n_part   = n;
	}
	else if ( req_part == BLIS_SUBPART1AND2 )
	{
		// A1+A2 (offm,offn) += (i,0).
		// A1+A2 is (m-i) x n.
		offm_inc = i;
		offn_inc = 0;
		m_part   = m - i;
		n_part   = n;
	}
	else if ( req_part == BLIS_SUBPART2 ||
	          req_part == subpart2_alias )
	{
		// A2 (offm,offn) += (i+b,0).
		// A2 is (m-i-b) x n.
		offm_inc = i + b;
		offn_inc = 0;
		m_part   = m - i - b;
		n_part   = n;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed. In the transposed case the m and n
	// quantities are swapped and the diagonal offset increment is negated.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the subpartition does not intersect the root matrix's
	// diagonal, then we might need to modify some of the subpartition's
	// properties, depending on its structure type.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	      bli_obj_is_outside_diag( sub_obj ) )
	{
		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}
// Convenience wrapper: partition obj along the n dimension in the forward
// direction by delegating to bli_acquire_mpart_ndim() with BLIS_FWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_l2r
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_FWD;

	bli_acquire_mpart_ndim( direct, req_part, i, b, obj, sub_obj );
}
// Convenience wrapper: partition obj along the n dimension in the backward
// direction by delegating to bli_acquire_mpart_ndim() with BLIS_BWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_r2l
     (
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_BWD;

	bli_acquire_mpart_ndim( direct, req_part, j, b, obj, sub_obj );
}
// Acquire a subpartition of obj along its n dimension (where m and n are
// taken after accounting for any transposition flagged on obj) and store the
// resulting view in sub_obj.
//
// Parameters:
// - direct:   BLIS_FWD or BLIS_BWD. For BLIS_BWD, j is relocated to
//             n - j - b before partitioning.
// - req_part: which subpartition to return: BLIS_SUBPART0, BLIS_SUBPART1,
//             BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, or the
//             direction-relative aliases BLIS_SUBPART1B / BLIS_SUBPART1A.
// - j:        offset of subpartition 1 along the n dimension.
// - b:        requested size of subpartition 1; clamped to at most n - j.
// - obj:      the object being partitioned.
// - sub_obj:  receives the requested subpartition.
//
// If req_part matches none of the values handled below, the offset
// increments and the partition dimensions keep their zero initial values.
void bli_acquire_mpart_ndim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t  m;
	dim_t  n;
	dim_t  m_part   = 0;
	dim_t  n_part   = 0;
	inc_t  offm_inc = 0;
	inc_t  offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	// NOTE(review): this branch is taken before direct is examined, so
	// direct is not passed on for panel-packed objects.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj );
		return;
	}

	// Check parameters. The l2r check function is used for both directions.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_l2r_check( req_part, j, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of the n dimension at
	// column offset j.
	if ( b > n - j ) b = n - j;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate j.
	if ( direct == BLIS_BWD )
	{
		// Modify j to account for the fact that we are moving backwards.
		j = n - j - b;
	}

	// Support SUBPART1B (behind SUBPART1) and SUBPART1A (ahead of SUBPART1),
	// to refer to subpartitions 0 and 2 when moving forward, and 2 and 0 when
	// moving backward.
	subpart_t subpart0_alias;
	subpart_t subpart2_alias;

	if ( direct == BLIS_FWD ) { subpart0_alias = BLIS_SUBPART1B;
	                            subpart2_alias = BLIS_SUBPART1A; }
	else                      { subpart0_alias = BLIS_SUBPART1A;
	                            subpart2_alias = BLIS_SUBPART1B; }

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.
	if      ( req_part == BLIS_SUBPART0 ||
	          req_part == subpart0_alias )
	{
		// A0 (offm,offn) unchanged.
		// A0 is m x j.
		offm_inc = 0;
		offn_inc = 0;
		m_part   = m;
		n_part   = j;
	}
	else if ( req_part == BLIS_SUBPART1AND0 )
	{
		// A1+A0 (offm,offn) unchanged.
		// A1+A0 is m x (j+b).
		offm_inc = 0;
		offn_inc = 0;
		m_part   = m;
		n_part   = j + b;
	}
	else if ( req_part == BLIS_SUBPART1 )
	{
		// A1 (offm,offn) += (0,j).
		// A1 is m x b.
		offm_inc = 0;
		offn_inc = j;
		m_part   = m;
		n_part   = b;
	}
	else if ( req_part == BLIS_SUBPART1AND2 )
	{
		// A1+A2 (offm,offn) += (0,j).
		// A1+A2 is m x (n-j).
		offm_inc = 0;
		offn_inc = j;
		m_part   = m;
		n_part   = n - j;
	}
	else if ( req_part == BLIS_SUBPART2 ||
	          req_part == subpart2_alias )
	{
		// A2 (offm,offn) += (0,j+b).
		// A2 is m x (n-j-b).
		offm_inc = 0;
		offn_inc = j + b;
		m_part   = m;
		n_part   = n - j - b;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed. In the transposed case the m and n
	// quantities are swapped and the diagonal offset increment is negated.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the subpartition does not intersect the root matrix's
	// diagonal, then we might need to modify some of the subpartition's
	// properties, depending on its structure type.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	      bli_obj_is_outside_diag( sub_obj ) )
	{
		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}
// Convenience wrapper: partition obj along both dimensions in the forward
// direction by delegating to bli_acquire_mpart_mndim() with BLIS_FWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_tl2br
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_FWD;

	bli_acquire_mpart_mndim( direct, req_part, i, b, obj, sub_obj );
}
// Convenience wrapper: partition obj along both dimensions in the backward
// direction by delegating to bli_acquire_mpart_mndim() with BLIS_BWD. All
// other arguments are passed through unchanged.
void bli_acquire_mpart_br2tl
     (
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	const dir_t direct = BLIS_BWD;

	bli_acquire_mpart_mndim( direct, req_part, j, b, obj, sub_obj );
}
// Acquire one of the nine subpartitions of a 3x3 partitioning of obj, where
// the partitioning point moves along both the m and n dimensions at once
// (m and n being taken after accounting for any transposition flagged on
// obj). The resulting view is stored in sub_obj.
//
// Parameters:
// - direct:   BLIS_FWD or BLIS_BWD. For BLIS_BWD, ij is relocated to
//             min(m,n) - ij - b before partitioning.
// - req_part: which subpartition to return: BLIS_SUBPART00 through
//             BLIS_SUBPART22. Any value not matched explicitly below is
//             treated as BLIS_SUBPART22 by the final else branch.
// - ij:       row and column offset of subpartition 11.
// - b:        requested size of subpartition 11 (b x b); clamped to at most
//             min(m,n) - ij.
// - obj:      the object being partitioned.
// - sub_obj:  receives the requested subpartition.
void bli_acquire_mpart_mndim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     ij,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t  m;
	dim_t  n;
	dim_t  min_m_n;
	dim_t  m_part   = 0;
	dim_t  n_part   = 0;
	inc_t  offm_inc = 0;
	inc_t  offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	// NOTE(review): this branch is taken before direct is examined, so
	// direct is not passed on for panel-packed objects.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj );
		return;
	}

	// Check parameters. The tl2br check function is used for both
	// directions.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_tl2br_check( req_part, ij, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of min(m,n) at
	// row/column offset ij.
	min_m_n = bli_min( m, n );
	if ( b > min_m_n - ij ) b = min_m_n - ij;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate ij.
	if ( direct == BLIS_BWD )
	{
		// Modify ij to account for the fact that we are moving backwards.
		ij = min_m_n - ij - b;
	}

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.

	// Left column of subpartitions
	if      ( req_part == BLIS_SUBPART00 )
	{
		// A00 (offm,offn) unchanged.
		// A00 is ij x ij.
		offm_inc = 0;
		offn_inc = 0;
		m_part   = ij;
		n_part   = ij;
	}
	else if ( req_part == BLIS_SUBPART10 )
	{
		// A10 (offm,offn) += (ij,0).
		// A10 is b x ij.
		offm_inc = ij;
		offn_inc = 0;
		m_part   = b;
		n_part   = ij;
	}
	else if ( req_part == BLIS_SUBPART20 )
	{
		// A20 (offm,offn) += (ij+b,0).
		// A20 is (m-ij-b) x ij.
		offm_inc = ij + b;
		offn_inc = 0;
		m_part   = m - ij - b;
		n_part   = ij;
	}
	// Middle column of subpartitions.
	else if ( req_part == BLIS_SUBPART01 )
	{
		// A01 (offm,offn) += (0,ij).
		// A01 is ij x b.
		offm_inc = 0;
		offn_inc = ij;
		m_part   = ij;
		n_part   = b;
	}
	else if ( req_part == BLIS_SUBPART11 )
	{
		// A11 (offm,offn) += (ij,ij).
		// A11 is b x b.
		offm_inc = ij;
		offn_inc = ij;
		m_part   = b;
		n_part   = b;
	}
	else if ( req_part == BLIS_SUBPART21 )
	{
		// A21 (offm,offn) += (ij+b,ij).
		// A21 is (m-ij-b) x b.
		offm_inc = ij + b;
		offn_inc = ij;
		m_part   = m - ij - b;
		n_part   = b;
	}
	// Right column of subpartitions.
	else if ( req_part == BLIS_SUBPART02 )
	{
		// A02 (offm,offn) += (0,ij+b).
		// A02 is ij x (n-ij-b).
		offm_inc = 0;
		offn_inc = ij + b;
		m_part   = ij;
		n_part   = n - ij - b;
	}
	else if ( req_part == BLIS_SUBPART12 )
	{
		// A12 (offm,offn) += (ij,ij+b).
		// A12 is b x (n-ij-b).
		offm_inc = ij;
		offn_inc = ij + b;
		m_part   = b;
		n_part   = n - ij - b;
	}
	else // if ( req_part == BLIS_SUBPART22 )
	{
		// A22 (offm,offn) += (ij+b,ij+b).
		// A22 is (m-ij-b) x (n-ij-b).
		offm_inc = ij + b;
		offn_inc = ij + b;
		m_part   = m - ij - b;
		n_part   = n - ij - b;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed. In the transposed case the m and n
	// quantities are swapped and the diagonal offset increment is negated.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the requested subpartition is one of the off-diagonal
	// blocks (anything other than A00, A11, or A22), then we might need to
	// modify some of the subpartition's properties, depending on its
	// structure type. Note that this test is based on req_part rather than
	// on a query of the subpartition's diagonal offset.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	     req_part != BLIS_SUBPART00 &&
	     req_part != BLIS_SUBPART11 &&
	     req_part != BLIS_SUBPART22 )
	{
		// FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal
		// intersecting subpartitions should inherit their root object's
		// uplo field, or it should not. Right now, they DO inherit the
		// uplo (because they are not set to BLIS_DENSE when the diagonal
		// does not intersect). But the whole point of being able to query
		// the root object's properties (e.g. uplo field) was so that we
		// COULD mark such subpartitions as dense, to make it easier for
		// certain subproblems on those subpartitions--subproblems that
		// are agnostic to where the subpartition came from.

		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}
// -- Vector partitioning ------------------------------------------------------
// Acquire a subpartition of a vector object in the forward direction. If
// obj tests as a column vector, the partitioning is done along the m
// dimension; otherwise it is done along the n dimension. All other
// arguments are passed through unchanged.
void bli_acquire_vpart_f2b
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	if ( bli_obj_is_col_vector( obj ) )
	{
		bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj );
		return;
	}

	// Otherwise, treat obj as a row vector.
	bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
}
// Acquire a subpartition of a vector object in the backward direction. If
// obj tests as a column vector, the partitioning is done along the m
// dimension; otherwise it is done along the n dimension. All other
// arguments are passed through unchanged.
void bli_acquire_vpart_b2f
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	if ( bli_obj_is_col_vector( obj ) )
	{
		bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj );
		return;
	}

	// Otherwise, treat obj as a row vector.
	bli_acquire_mpart_ndim( BLIS_BWD, req_part, i, b, obj, sub_obj );
}
// -- Scalar acquisition -------------------------------------------------------
// Acquire a view of the single element at offset (i,j) of obj and store it
// in sub_obj. This is done in two steps: a width-1 partition at offset j
// along the n dimension, followed by a length-1 partition at offset i along
// the m dimension of that intermediate result.
void bli_acquire_mij
     (
       dim_t  i,
       dim_t  j,
       obj_t* obj,
       obj_t* sub_obj
     )
{
	// Intermediate view holding only the j-th slice along n.
	obj_t slice_j;

	bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, j, 1, obj, &slice_j );

	bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, 1, &slice_j, sub_obj );
}
// Acquire a view of the single element at offset i of a vector object and
// store it in sub_obj. If obj tests as a column vector, a size-1 partition
// is taken along the m dimension; otherwise it is taken along the n
// dimension.
void bli_acquire_vi
     (
       dim_t  i,
       obj_t* obj,
       obj_t* sub_obj
     )
{
	if ( bli_obj_is_col_vector( obj ) )
	{
		bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, 1, obj, sub_obj );
		return;
	}

	// Otherwise, treat obj as a row vector.
	bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, i, 1, obj, sub_obj );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_part.h 0000664 0000000 0000000 00000006567 14634250137 0021616 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "bli_part_check.h"
// -- Matrix partitioning ------------------------------------------------------
// Exported prototype for the general matrix partitioning routine (defined in
// bli_part.c).
// NOTE(review): presumably (i, j) is the offset of the requested submatrix
// within obj and (m, n) its dimensions, with the resulting view written to
// sub_obj -- confirm against the definition.
BLIS_EXPORT_BLIS void bli_acquire_mpart
(
dim_t i,
dim_t j,
dim_t m,
dim_t n,
obj_t* obj,
obj_t* sub_obj
);
// Prototype generator for the fixed-direction matrix partitioning functions.
// Each GENPROT( opname ) below expands to one exported prototype whose name
// is formed by PASTEMAC0( opname ) and whose parameters are
// ( req_part, i, b, obj, sub_obj ).
// NOTE(review): the suffixes presumably mean top-to-bottom, bottom-to-top,
// left-to-right, right-to-left, and the two diagonal directions -- confirm
// against the definitions in bli_part.c.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
( \
subpart_t req_part, \
dim_t i, \
dim_t b, \
obj_t* obj, \
obj_t* sub_obj \
);
GENPROT( acquire_mpart_t2b )
GENPROT( acquire_mpart_b2t )
GENPROT( acquire_mpart_l2r )
GENPROT( acquire_mpart_r2l )
GENPROT( acquire_mpart_tl2br )
GENPROT( acquire_mpart_br2tl )
// Prototype generator for the dimension-based matrix partitioning functions.
// These take the same parameters as the fixed-direction variants plus a
// leading dir_t argument, so the traversal direction is chosen by the caller
// (e.g. bli_acquire_vi() passes BLIS_FWD to the _mdim and _ndim variants).
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
( \
dir_t direct, \
subpart_t req_part, \
dim_t i, \
dim_t b, \
obj_t* obj, \
obj_t* sub_obj \
);
GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )
// -- Vector partitioning ------------------------------------------------------
// Prototype generator for the vector partitioning functions. Each use below
// expands to an exported prototype, named via PASTEMAC0( opname ), taking
// ( req_part, i, b, obj, sub_obj ).
// NOTE(review): f2b / b2f presumably stand for front-to-back and
// back-to-front -- confirm against the definitions.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
( \
subpart_t req_part, \
dim_t i, \
dim_t b, \
obj_t* obj, \
obj_t* sub_obj \
);
GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )
// -- Scalar acquisition -------------------------------------------------------
// Exported prototype for the matrix scalar-acquisition routine.
// NOTE(review): presumably stores in sub_obj a view of the single element at
// row i, column j of obj -- confirm against the definition in bli_part.c.
BLIS_EXPORT_BLIS void bli_acquire_mij
(
dim_t i,
dim_t j,
obj_t* obj,
obj_t* sub_obj
);
// Exported prototype for the vector scalar-acquisition routine. The
// definition in bli_part.c requests BLIS_SUBPART1 with a block length of 1
// at offset i, partitioning along the m dimension for column vectors and
// along the n dimension otherwise.
BLIS_EXPORT_BLIS void bli_acquire_vi
(
dim_t i,
obj_t* obj,
obj_t* sub_obj
);
cython-blis-1.0.0/blis/_src/frame/base/bli_pba.c 0000664 0000000 0000000 00000042246 14634250137 0021377 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Statically initialize the mutex within the packing block allocator object.
// This file-scope object is the one whose address bli_pba_query() returns.
// Only the mutex is given an explicit initializer; the remaining members are
// zero-initialized and are filled in later by bli_pba_init().
static pba_t pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER };
// -----------------------------------------------------------------------------
// Return the address of the file-scope packing block allocator object.
pba_t* bli_pba_query( void )
{
    pba_t* const global_pba = &pba;

    return global_pba;
}
// Initialize the global packing block allocator: record the alignment and the
// malloc()/free() function pointers used for general-purpose buffers and,
// when BLIS_ENABLE_PBA_POOLS is defined, set up the internal memory pools
// using the given context.
void bli_pba_init
     (
       cntx_t* restrict cntx
     )
{
    pba_t* restrict allocator = bli_pba_query();

    // Settings consulted by bli_pba_acquire_m() and bli_pba_release() when
    // the buffer type is BLIS_BUFFER_FOR_GEN_USE.
    malloc_ft   gen_malloc_fp = BLIS_MALLOC_POOL;
    free_ft     gen_free_fp   = BLIS_FREE_POOL;
    const siz_t gen_align     = BLIS_POOL_ADDR_ALIGN_SIZE_GEN;

    bli_pba_set_align_size( gen_align, allocator );
    bli_pba_set_malloc_fp( gen_malloc_fp, allocator );
    bli_pba_set_free_fp( gen_free_fp, allocator );

    // No mutex setup is needed here: the allocator's mutex is statically
    // initialized at file scope, so nothing can fail at this point on its
    // account.

#ifdef BLIS_ENABLE_PBA_POOLS
    bli_pba_init_pools( cntx, allocator );
#endif
}
// Tear down the global packing block allocator: finalize the internal pools
// (when BLIS_ENABLE_PBA_POOLS is defined) and reset the general-purpose
// malloc()/free() function pointers to NULL.
void bli_pba_finalize
     (
       void
     )
{
    pba_t* restrict allocator = bli_pba_query();

#ifdef BLIS_ENABLE_PBA_POOLS
    bli_pba_finalize_pools( allocator );
#endif

    // The mutex is statically initialized at file scope and is therefore
    // intentionally never destroyed; only the function pointers are cleared.
    bli_pba_set_malloc_fp( NULL, allocator );
    bli_pba_set_free_fp( NULL, allocator );
}
// Acquire a buffer of at least req_size bytes and describe it in mem.
//
// - rntm:     runtime object from which the pba_t is queried.
// - req_size: number of bytes requested.
// - buf_type: BLIS_BUFFER_FOR_GEN_USE requests are served by a fresh aligned
//             allocation; any other type is served by checking a block out of
//             the corresponding internal pool (under the pba mutex).
// - mem:      output; receives the buffer, buffer type, pool, and size.
void bli_pba_acquire_m
     (
       rntm_t*   rntm,
       siz_t     req_size,
       packbuf_t buf_type,
       mem_t*    mem
     )
{
#ifndef BLIS_ENABLE_PBA_POOLS
    // With the internal pools compiled out, every request is relabeled as a
    // general-use request so that it is served by bli_fmalloc_align() below.
    buf_type = BLIS_BUFFER_FOR_GEN_USE;
#endif

#if !defined(BLIS_ENABLE_PBA_POOLS) && defined(BLIS_ENABLE_MEM_TRACING)
    printf( "bli_pba_acquire_m(): bli_fmalloc_align(): size %ld\n",
            ( long )req_size );
#endif

    // The allocator to use is carried by the runtime object.
    pba_t* pba = bli_rntm_pba( rntm );

    if ( buf_type != BLIS_BUFFER_FOR_GEN_USE )
    {
        // Pool path: blocks are allocated once and then recycled.

        // Translate the buffer type into a zero-based pool index and look up
        // the matching pool.
        const dim_t pool_idx = bli_packbuf_index( buf_type );
        pool_t*     src_pool = bli_pba_pool( pool_idx, pba );

        // The checked-out block is written straight into the pblk_t that
        // lives inside the caller's mem_t.
        pblk_t* blk = bli_mem_pblk( mem );

        // Critical section: the checkout may reinitialize the pool with
        // larger blocks, or grow it when no block is available, so it must
        // be serialized with other users of this pba.
        bli_pba_lock( pba );
        bli_pool_checkout_block( req_size, blk, src_pool );
        bli_pba_unlock( pba );

        // The block handed out is at least req_size bytes, possibly more;
        // record the actual block size rather than the requested size.
        const siz_t blk_size = bli_pblk_block_size( blk );

        bli_mem_set_buf_type( buf_type, mem );
        bli_mem_set_pool( src_pool, mem );
        bli_mem_set_size( blk_size, mem );

        return;
    }

    // General-use path: a plain aligned allocation is considered sufficient.
    // NOTE(review): the error code written to r_val is not inspected here.
    err_t r_val;
    void* gen_buf = bli_fmalloc_align
    (
      bli_pba_malloc_fp( pba ),
      req_size,
      bli_pba_align_size( pba ),
      &r_val
    );

    // Describe the allocation in the mem_t. The pool is recorded as NULL
    // because this buffer did not come from a memory pool.
    bli_mem_set_buffer( gen_buf, mem );
    bli_mem_set_buf_type( buf_type, mem );
    bli_mem_set_pool( NULL, mem );
    bli_mem_set_size( req_size, mem );
}
// Release the buffer described by mem back to where it came from, then clear
// mem so that it appears unallocated.
//
// - rntm: runtime object from which the pba_t is queried.
// - mem:  descriptor previously filled in by bli_pba_acquire_m().
void bli_pba_release
     (
       rntm_t* rntm,
       mem_t*  mem
     )
{
    // The allocator to use is carried by the runtime object.
    pba_t* pba = bli_rntm_pba( rntm );

    // The buffer type recorded at acquisition time tells us how the memory
    // was obtained and therefore how it must be given back.
    const packbuf_t buf_type = bli_mem_buf_type( mem );

#if !defined(BLIS_ENABLE_PBA_POOLS) && defined(BLIS_ENABLE_MEM_TRACING)
    printf( "bli_pba_release(): bli_ffree_align(): size %ld\n",
            ( long )bli_mem_size( mem ) );
#endif

    if ( buf_type != BLIS_BUFFER_FOR_GEN_USE )
    {
        // Pool path: hand the block back to the pool recorded in the mem_t.
        pool_t* home_pool = bli_mem_pool( mem );
        pblk_t* blk       = bli_mem_pblk( mem );

        // Critical section: the check-in modifies shared pool state.
        bli_pba_lock( pba );
        bli_pool_checkin_block( blk, home_pool );
        bli_pba_unlock( pba );
    }
    else
    {
        // General-use buffers were dynamically allocated, so free them with
        // the pba's free() function pointer.
        void* gen_buf = bli_mem_buffer( mem );

        bli_ffree_align( bli_pba_free_fp( pba ), gen_buf );
    }

    // Reset the descriptor.
    // NOTE(review): the original comment states that the buf_type field is
    // left untouched by bli_mem_clear() because packbuf_t has no
    // "uninitialized" value -- confirm against bli_mem_clear().
    bli_mem_clear( mem );
}
// Disabled (never compiled) convenience wrapper that would request a
// general-use buffer via bli_pba_acquire_m().
// NOTE(review): this code is stale -- it passes a pba_t* as the first
// argument, whereas bli_pba_acquire_m() in this file takes a rntm_t*. It
// would need updating before being re-enabled.
#if 0
void bli_pba_acquire_v
(
pba_t* pba,
siz_t req_size,
mem_t* mem
)
{
bli_pba_acquire_m
(
pba,
req_size,
BLIS_BUFFER_FOR_GEN_USE,
mem
);
}
#endif
// Return the "size" of the pool associated with buf_type, computed as the
// pool's block size multiplied by its number of blocks. General-use memory
// is not tracked, so BLIS_BUFFER_FOR_GEN_USE always yields 0.
siz_t bli_pba_pool_size
     (
       pba_t*    pba,
       packbuf_t buf_type
     )
{
    // Nothing is (yet) recorded about general-purpose allocations.
    if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) return 0;

    // Locate the pool that serves this buffer type.
    pool_t* pool = bli_pba_pool( bli_packbuf_index( buf_type ), pba );

    return bli_pool_block_size( pool ) *
           bli_pool_num_blocks( pool );
}
// -----------------------------------------------------------------------------
// Initialize the three internal memory pools of pba (blocks of A, panels of
// B, panels of C). All pools start with zero blocks; block sizes are derived
// from the blocksizes stored in cntx.
void bli_pba_init_pools
     (
       cntx_t* cntx,
       pba_t*  pba
     )
{
    // Work out how large a block each pool must be able to hold.
    siz_t blk_sz_a = 0;
    siz_t blk_sz_b = 0;
    siz_t blk_sz_c = 0;

    bli_pba_compute_pool_block_sizes( &blk_sz_a, &blk_sz_b, &blk_sz_c, cntx );

    // Every pool starts out empty.
    const dim_t initial_num_blocks = 0;

    // The A and B pools begin with a reasonably long block_ptrs array; the C
    // pool begins with an empty one.
    const dim_t ptrs_len_ab = 80;
    const dim_t ptrs_len_c  = 0;

    // Pool blocks are obtained and released with the malloc()/free()
    // designated for pools at configure-time.
    malloc_ft pool_malloc_fp = BLIS_MALLOC_POOL;
    free_ft   pool_free_fp   = BLIS_FREE_POOL;

    // Pool for blocks of A, using the configure-time alignment and offset.
    bli_pool_init
    (
      initial_num_blocks, ptrs_len_ab, blk_sz_a,
      BLIS_POOL_ADDR_ALIGN_SIZE_A, BLIS_POOL_ADDR_OFFSET_SIZE_A,
      pool_malloc_fp, pool_free_fp,
      bli_pba_pool( bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ), pba )
    );

    // Pool for panels of B.
    bli_pool_init
    (
      initial_num_blocks, ptrs_len_ab, blk_sz_b,
      BLIS_POOL_ADDR_ALIGN_SIZE_B, BLIS_POOL_ADDR_OFFSET_SIZE_B,
      pool_malloc_fp, pool_free_fp,
      bli_pba_pool( bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ), pba )
    );

    // Pool for panels of C.
    bli_pool_init
    (
      initial_num_blocks, ptrs_len_c, blk_sz_c,
      BLIS_POOL_ADDR_ALIGN_SIZE_C, BLIS_POOL_ADDR_OFFSET_SIZE_C,
      pool_malloc_fp, pool_free_fp,
      bli_pba_pool( bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ), pba )
    );
}
// Finalize the three internal memory pools of pba, in the order A, B, C.
void bli_pba_finalize_pools
     (
       pba_t* pba
     )
{
    // The buffer types that are backed by a pool, in finalization order.
    const packbuf_t pooled_types[ 3 ] =
    {
      BLIS_BUFFER_FOR_A_BLOCK,
      BLIS_BUFFER_FOR_B_PANEL,
      BLIS_BUFFER_FOR_C_PANEL
    };
    pool_t* pools[ 3 ];

    // Resolve each buffer type to its zero-based index and then to the
    // address of the corresponding pool.
    for ( dim_t k = 0; k < 3; ++k )
    {
        const dim_t pool_idx = bli_packbuf_index( pooled_types[ k ] );

        pools[ k ] = bli_pba_pool( pool_idx, pba );
    }

    // Release the resources held by each pool.
    for ( dim_t k = 0; k < 3; ++k )
    {
        bli_pool_finalize( pools[ k ] );
    }
}
// -----------------------------------------------------------------------------
// Compute the block size to use for each of the A, B, and C pools as the
// maximum, over all datatypes considered, of the per-datatype block sizes.
// Sizing for the largest datatype means the pools need not be reallocated
// when the user switches datatypes.
//
// - bs_a, bs_b, bs_c: outputs; receive the block sizes for the three pools.
// - cntx:             context from which the method and blocksizes are read.
void bli_pba_compute_pool_block_sizes
     (
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     )
{
    const ind_t method = bli_cntx_method( cntx );

    // Running maxima for each pool.
    siz_t max_a = 0;
    siz_t max_b = 0;
    siz_t max_c = 0;

    num_t dt;

    for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt )
    {
        // Real datatypes are only considered for native execution; they are
        // skipped when an induced method is in use.
        const int skip_dt = ( bli_is_real( dt ) && method != BLIS_NAT );

        if ( !skip_dt )
        {
            siz_t cur_a;
            siz_t cur_b;
            siz_t cur_c;

            bli_pba_compute_pool_block_sizes_dt( dt,
                                                 &cur_a,
                                                 &cur_b,
                                                 &cur_c,
                                                 cntx );

            max_a = bli_max( cur_a, max_a );
            max_b = bli_max( cur_b, max_b );
            max_c = bli_max( cur_c, max_c );
        }
    }

    // Hand the results back to the caller.
    *bs_a = max_a;
    *bs_b = max_b;
    *bs_c = max_c;
}
// -----------------------------------------------------------------------------
// Compute, for a single datatype dt, the number of bytes a block must have in
// each of the A, B, and C pools, based on the register and cache blocksizes
// stored in cntx.
//
// - dt:               datatype for which to compute the sizes.
// - bs_a, bs_b, bs_c: outputs; receive the block sizes in bytes.
// - cntx:             context from which the blocksizes are queried.
void bli_pba_compute_pool_block_sizes_dt
     (
       num_t   dt,
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     )
{
    const siz_t elem_size = bli_dt_size( dt );

    //
    // Register blocksizes.
    //

    blksz_t* mr_blksz = bli_cntx_get_blksz( BLIS_MR, cntx );
    blksz_t* nr_blksz = bli_cntx_get_blksz( BLIS_NR, cntx );

    // Default values are the register blocksizes proper; the "max" values
    // are used as the packing register blocksizes (packmr and packnr).
    const dim_t mr     = bli_blksz_get_def( dt, mr_blksz );
    const dim_t nr     = bli_blksz_get_def( dt, nr_blksz );
    const dim_t packmr = bli_blksz_get_max( dt, mr_blksz );
    const dim_t packnr = bli_blksz_get_max( dt, nr_blksz );

    const dim_t max_mnr = bli_max( mr, nr );

    //
    // Maximum cache blocksizes.
    //

    blksz_t* mc_blksz = bli_cntx_get_blksz( BLIS_MC, cntx );
    blksz_t* kc_blksz = bli_cntx_get_blksz( BLIS_KC, cntx );
    blksz_t* nc_blksz = bli_cntx_get_blksz( BLIS_NC, cntx );

    const dim_t mc_max = bli_blksz_get_max( dt, mc_blksz );
    const dim_t nc_max = bli_blksz_get_max( dt, nc_blksz );

    // kc is padded by max(mr,nr) to leave room for kc being nudged up at
    // runtime to a multiple of mr or nr for the triangular operations trmm,
    // trmm3, and trsm.
    const dim_t kc_max = bli_blksz_get_max( dt, kc_blksz ) + max_mnr;

    //
    // Scaling factors.
    //

    // The A and B blocks must be large enough that MR and NR can be swapped
    // (A packed with NR and B packed with MR), which is needed for right-side
    // trsm. We therefore scale by whichever of ( packmr / mr ) and
    // ( packnr / nr ) is larger. The comparison is cross-multiplied to avoid
    // integer division:
    //
    //   ( packmr / mr ) >= ( packnr / nr )   <=>   packmr * nr >= packnr * mr
    //
    dim_t scale_num;
    dim_t scale_den;

    if ( packmr * nr < packnr * mr )
    {
        scale_num = packnr;
        scale_den = nr;
    }
    else
    {
        scale_num = packmr;
        scale_den = mr;
    }

    //
    // Pool block dimensions.
    //

    // Scale mc and nc by scale_num / scale_den, rounding up whenever the
    // division leaves a remainder.
    const dim_t mc_scaled = mc_max * scale_num;
    const dim_t nc_scaled = nc_max * scale_num;

    dim_t pool_mc = mc_scaled / scale_den;
    dim_t pool_nc = nc_scaled / scale_den;

    if ( mc_scaled % scale_den > 0 ) pool_mc += 1;
    if ( nc_scaled % scale_den > 0 ) pool_nc += 1;

    const dim_t pool_kc = kc_max;

    //
    // Pool block sizes.
    //

    // The A and B blocks get one extra micro-panel of space so that any
    // pre-loading performed by the micro-kernel cannot run past the end of
    // the block and cause a segmentation fault.
    const dim_t max_packmnr = bli_max( packmr, packnr );

    *bs_a = ( pool_mc + max_packmnr ) * pool_kc * elem_size;
    *bs_b = ( pool_nc + max_packmnr ) * pool_kc * elem_size;
    *bs_c = ( pool_mc               ) * pool_nc * elem_size;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_pba.h 0000664 0000000 0000000 00000010606 14634250137 0021377 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_MEMBRK_H
#define BLIS_MEMBRK_H
// Packing block allocator (formerly memory broker)
/*
typedef struct pba_s
{
pool_t pools[3];
bli_pthread_mutex_t mutex;
// These fields are used for general-purpose allocation.
siz_t align_size;
malloc_ft malloc_fp;
free_ft free_fp;
} pba_t;
*/
// pba init
//BLIS_INLINE void bli_pba_init_mutex( pba_t* pba )
//{
// bli_pthread_mutex_init( &(pba->mutex), NULL );
//}
//BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba )
//{
// bli_pthread_mutex_destroy( &(pba->mutex) );
//}
// pba query
// Return the address of entry pool_index of the pba's pools array. No bounds
// checking is performed on pool_index.
BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba )
{
    pool_t* const first_pool = pba->pools;

    return first_pool + pool_index;
}
// Return the alignment stored in the pba for general-purpose allocations.
BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba )
{
    const siz_t stored_align = pba->align_size;

    return stored_align;
}
// Return the malloc() function pointer stored in the pba for general-purpose
// allocations.
BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba )
{
    malloc_ft stored_fp = pba->malloc_fp;

    return stored_fp;
}
// Return the free() function pointer stored in the pba for general-purpose
// allocations.
BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba )
{
    free_ft stored_fp = pba->free_fp;

    return stored_fp;
}
// pba modification
// Store align_size in the pba as the alignment for general-purpose
// allocations.
BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba )
{
    siz_t* const field = &pba->align_size;

    *field = align_size;
}
// Store malloc_fp in the pba as the malloc() function pointer for
// general-purpose allocations.
BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba )
{
    malloc_ft* const field = &pba->malloc_fp;

    *field = malloc_fp;
}
// Store free_fp in the pba as the free() function pointer for general-purpose
// allocations.
BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba )
{
    free_ft* const field = &pba->free_fp;

    *field = free_fp;
}
// pba action
// Acquire the mutex embedded in the pba. Callers use this to open the
// critical section around pool checkout and check-in.
BLIS_INLINE void bli_pba_lock( pba_t* pba )
{
    bli_pthread_mutex_lock( &pba->mutex );
}
// Release the mutex embedded in the pba, closing the critical section opened
// by bli_pba_lock().
BLIS_INLINE void bli_pba_unlock( pba_t* pba )
{
    bli_pthread_mutex_unlock( &pba->mutex );
}
// -----------------------------------------------------------------------------
// Returns the address of the global packing block allocator object. This is
// the only function in this group that is exported from the library.
BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );
// Sets the general-purpose alignment and malloc()/free() function pointers of
// the global pba and, when BLIS_ENABLE_PBA_POOLS is defined, initializes its
// pools using cntx.
void bli_pba_init
(
cntx_t* cntx
);
// Finalizes the pools (when BLIS_ENABLE_PBA_POOLS is defined) and resets the
// general-purpose malloc()/free() function pointers to NULL.
void bli_pba_finalize
(
void
);
// Acquires a buffer of at least req_size bytes for the given buffer type,
// using the pba queried from rntm, and describes it in mem.
void bli_pba_acquire_m
(
rntm_t* rntm,
siz_t req_size,
packbuf_t buf_type,
mem_t* mem
);
// Returns the buffer described by mem (freeing it, or checking it back into
// its pool) and then clears mem.
void bli_pba_release
(
rntm_t* rntm,
mem_t* mem
);
// Attach the global packing block allocator, as returned by bli_pba_query(),
// to the given runtime object.
BLIS_INLINE void bli_pba_rntm_set_pba
     (
       rntm_t* rntm
     )
{
    bli_rntm_set_pba( bli_pba_query(), rntm );
}
// Returns block size times number of blocks for the pool that serves
// buf_type, or 0 for BLIS_BUFFER_FOR_GEN_USE (general-use memory is not
// tracked).
siz_t bli_pba_pool_size
(
pba_t* pba,
packbuf_t buf_type
);
// ----------------------------------------------------------------------------
// Initializes the A, B, and C pools of pba as empty pools whose block sizes
// are derived from cntx.
void bli_pba_init_pools
(
cntx_t* cntx,
pba_t* pba
);
// Finalizes the A, B, and C pools of pba.
void bli_pba_finalize_pools
(
pba_t* pba
);
// Computes the block size for each pool as the maximum of the per-datatype
// block sizes over the datatypes considered.
void bli_pba_compute_pool_block_sizes
(
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx
);
// Computes the block size, in bytes, that each pool needs for the single
// datatype dt.
void bli_pba_compute_pool_block_sizes_dt
(
num_t dt,
siz_t* bs_a,
siz_t* bs_b,
siz_t* bs_c,
cntx_t* cntx
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_pool.c 0000664 0000000 0000000 00000053041 14634250137 0021601 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//#define BLIS_ENABLE_MEM_TRACING
// Initialize pool with num_blocks freshly allocated blocks.
//
// - num_blocks:     number of blocks to allocate up front.
// - block_ptrs_len: requested length of the block_ptrs array; raised to at
//                   least num_blocks, and to at least 1.
// - block_size, align_size, offset_size: forwarded to bli_pool_alloc_block()
//                   for each block and recorded in the pool.
// - malloc_fp, free_fp: function pointers recorded in the pool; malloc_fp is
//                   also used to allocate the initial blocks.
// - pool:           the pool_t to initialize.
void bli_pool_init
     (
       siz_t            num_blocks,
       siz_t            block_ptrs_len,
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       free_ft          free_fp,
       pool_t* restrict pool
     )
{
    err_t r_val;

    // The block_ptrs array must have room for every initial block.
    if ( block_ptrs_len < num_blocks ) block_ptrs_len = num_blocks;

    // Never request a zero-length array: the behavior of a zero-byte malloc()
    // is not fixed, and a one-element array also guards against follow-on
    // memory corruption bugs.
    if ( block_ptrs_len == 0 ) block_ptrs_len = 1;

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_pool_init(): allocating block_ptrs (length %d): ",
            ( int )block_ptrs_len );
#endif

    // Allocate the array of pblk_t entries using the internal allocator.
    // FGVZ: Do we want to call malloc_fp() for internal data structures as
    // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g.
    const siz_t array_bytes = block_ptrs_len * sizeof( pblk_t );

    pblk_t* restrict block_ptrs = bli_malloc_intl( array_bytes, &r_val );

    // Allocate one block for each of the first num_blocks entries.
    dim_t i = 0;

    while ( i < num_blocks )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_pool_init(): allocating block %d of size %d (align %d, offset %d).\n",
                ( int )i, ( int )block_size, ( int )align_size, ( int )offset_size );
        fflush( stdout );
#endif

        bli_pool_alloc_block
        (
          block_size,
          align_size,
          offset_size,
          malloc_fp,
          block_ptrs + i
        );

        ++i;
    }

    // NOTE: top_index behaves like a stack pointer on a number line. A pool
    // with no blocks checked out has top_index == 0, and a pool with every
    // block checked out has top_index == num_blocks (where num_blocks is the
    // number of blocks currently allocated in the pool). Blocks are checked
    // out from the lowest index upward and new blocks are added at the high
    // end. This orientation was chosen deliberately over the alternative in
    // which top_index == -1 denotes an empty stack.

    // Record the array and its bookkeeping.
    bli_pool_set_block_ptrs( block_ptrs, pool );
    bli_pool_set_block_ptrs_len( block_ptrs_len, pool );
    bli_pool_set_top_index( 0, pool );
    bli_pool_set_num_blocks( num_blocks, pool );

    // Record the parameters with which blocks are allocated and freed.
    bli_pool_set_block_size( block_size, pool );
    bli_pool_set_align_size( align_size, pool );
    bli_pool_set_offset_size( offset_size, pool );
    bli_pool_set_malloc_fp( malloc_fp, pool );
    bli_pool_set_free_fp( free_fp, pool );
}
// Free every block entry currently tracked by pool, followed by the
// block_ptrs array itself. The pool_t struct's fields are left as they were.
//
// NOTE: This implementation assumes that either:
// - all blocks have been checked in by all threads, or
// - some subset of blocks have been checked in and the caller is
//   bli_pool_reinit().
void bli_pool_finalize
     (
       pool_t* restrict pool
     )
{
    pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );

    // Number of blocks currently allocated as part of the pool.
    const siz_t num_blocks = bli_pool_num_blocks( pool );

    // NOTE: The sanity check below is disabled on purpose. Because
    // bli_pool_reinit() is implemented as bli_pool_finalize() followed by
    // bli_pool_init(), a reinit that happens while blocks are checked out
    // legitimately arrives here with top_index != 0.
#if 0
    // Query the top_index of the pool.
    const siz_t top_index = bli_pool_top_index( pool );

    // Sanity check: The top_index should be zero.
    if ( top_index != 0 )
    {
        printf( "bli_pool_finalize(): final top_index == %d (expected 0); block_size: %d.\n",
                ( int )top_index, ( int )bli_pool_block_size( pool ) );
        printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" );
        bli_abort();
    }
#endif

    // Blocks are released with the free() function the pool was given.
    free_ft free_fp = bli_pool_free_fp( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d, offset %d).\n",
            ( int )num_blocks, ( int )bli_pool_block_size( pool ),
                               ( int )bli_pool_align_size( pool ),
                               ( int )bli_pool_offset_size( pool ) );
    fflush( stdout );
#endif

    // The offset size is passed along to bli_pool_free_block() for each
    // block.
    const siz_t offset_size = bli_pool_offset_size( pool );

    // Walk all num_blocks entries, including those whose blocks are
    // currently checked out.
    dim_t i = 0;

    while ( i < num_blocks )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_pool_finalize(): block %d: ", ( int )i );
#endif

        bli_pool_free_block( offset_size, free_fp, block_ptrs + i );

        ++i;
    }

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_pool_finalize(): freeing block_ptrs (length %d): ",
            ( int )( bli_pool_block_ptrs_len( pool ) ) );
#endif

    // Finally, release the array of entries itself.
    bli_free_intl( block_ptrs );

    // Explicitly clearing the pool_t struct is not strictly necessary, so
    // the code that would do so is disabled.
#if 0
    // Clear the contents of the pool_t struct.
    bli_pool_set_block_ptrs( NULL, pool );
    bli_pool_set_block_ptrs_len( 0, pool );
    bli_pool_set_num_blocks( 0, pool );
    bli_pool_set_top_index( 0, pool );
    bli_pool_set_block_size( 0, pool );
    bli_pool_set_align_size( 0, pool );
    bli_pool_set_offset_size( 0, pool );
#endif
}
// Tear down pool and set it up again with new parameters, keeping the
// malloc()/free() function pointers it was originally given.
void bli_pool_reinit
     (
       siz_t            num_blocks_new,
       siz_t            block_ptrs_len_new,
       siz_t            block_size_new,
       siz_t            align_size_new,
       siz_t            offset_size_new,
       pool_t* restrict pool
     )
{
    // Save the allocator callbacks before tearing the pool down so they can
    // be handed back to bli_pool_init().
    free_ft   saved_free_fp   = bli_pool_free_fp( pool );
    malloc_ft saved_malloc_fp = bli_pool_malloc_fp( pool );

    // Release the pool in its current configuration. Blocks that are still
    // checked out to threads are not freed at this point; they are freed
    // when those threads try to check them back in, which is detectable
    // because each pblk carries its block size and is copied on checkout.
    bli_pool_finalize( pool );

    // Rebuild the pool with the new parameters (notably the new block size).
    bli_pool_init
    (
      num_blocks_new,
      block_ptrs_len_new,
      block_size_new,
      align_size_new,
      offset_size_new,
      saved_malloc_fp,
      saved_free_fp,
      pool
    );
}
// Check a block of at least req_size bytes out of pool and copy its pblk_t
// into *block.
//
// - req_size: minimum number of bytes the block must provide.
// - block:    output; receives a copy of the checked-out pblk_t.
// - pool:     pool to check out from; it is reinitialized with larger blocks
//             and/or grown by one block if necessary.
void bli_pool_checkout_block
     (
       siz_t            req_size,
       pblk_t* restrict block,
       pool_t* restrict pool
     )
{
    // If the pool's current block size is too small for the request,
    // reinitialize the pool so that its blocks are req_size bytes, keeping
    // all of its other parameters.
    if ( bli_pool_block_size( pool ) < req_size )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_pool_checkout_block(): old block size %d < req size %d; "
                "reiniting.\n",
                ( int )bli_pool_block_size( pool ), ( int )req_size );
        fflush( stdout );
#endif

        bli_pool_reinit
        (
          bli_pool_num_blocks( pool ),
          bli_pool_block_ptrs_len( pool ),
          req_size,
          bli_pool_align_size( pool ),
          bli_pool_offset_size( pool ),
          pool
        );
    }

    // If the pool reports itself exhausted, add one more block.
    if ( bli_pool_is_exhausted( pool ) )
    {
#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_pool_checkout_block(): pool is exhausted (block size %d); "
                "growing by 1.\n", ( int )bli_pool_block_size( pool ) );
        fflush( stdout );
#endif

        bli_pool_grow( 1, pool );
    }

    // From here on, at least one block is available at top_index.
    const siz_t top_index = bli_pool_top_index( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
    printf( "bli_pool_checkout_block(): checking out block %d of size %d "
            "(align %d).\n",
            ( int )top_index, ( int )bli_pool_block_size( pool ),
                              ( int )bli_pool_align_size( pool ) );
    fflush( stdout );
#endif

    // Locate the entry being handed out and copy it to the caller.
    pblk_t* slot = bli_pool_block_ptrs( pool ) + top_index;

    *block = *slot;

    // Clear the pool's copy of the entry now that the caller owns the block.
    // NOTE(review): bli_pool_finalize() walks all num_blocks entries, even
    // when some are checked out (as during bli_pool_reinit()), so this
    // clearing presumably keeps it from freeing a buffer the caller still
    // holds -- confirm against bli_pool_free_block().
    bli_pblk_clear( slot );

    // Advance top_index past the entry that was just handed out.
    bli_pool_set_top_index( top_index + 1, pool );
}
// Return a previously checked-out block to pool. A block whose recorded size
// differs from the pool's current block size is freed instead of being put
// back.
//
// - block: the pblk_t that was filled in at checkout.
// - pool:  the pool the block was checked out from.
void bli_pool_checkin_block
     (
       pblk_t* restrict block,
       pool_t* restrict pool
     )
{
    const int size_matches =
      ( bli_pblk_block_size( block ) == bli_pool_block_size( pool ) );

    if ( size_matches )
    {
        // Normal case: put the block back at the top of the stack.
        pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );
        const siz_t      top_index  = bli_pool_top_index( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
        printf( "bli_pool_checkin_block(): checking in block %d of size %d "
                "(align %d, offset %d).\n",
                ( int )top_index - 1, ( int )bli_pool_block_size( pool ),
                                      ( int )bli_pool_align_size( pool ),
                                      ( int )bli_pool_offset_size( pool ) );
        fflush( stdout );
#endif

        // The entry just below top_index receives the caller's pblk_t, and
        // top_index then moves down to point at it.
        const siz_t new_top = top_index - 1;

        block_ptrs[ new_top ] = *block;

        bli_pool_set_top_index( new_top, pool );
    }
    else
    {
        // "Orphaned" block: it was allocated with a block size other than
        // the one the pool uses now, which happens when the pool was
        // reinitialized to a different (larger) block size while the block
        // was checked out. It is of no further use, so free it with the
        // pool's free() function and offset size.
        bli_pool_free_block
        (
          bli_pool_offset_size( pool ),
          bli_pool_free_fp( pool ),
          block
        );
    }
}
// Adds num_blocks_add newly allocated blocks to the pool, first replacing the
// block_ptrs array with a larger one if it cannot hold the new total.
//
//   num_blocks_add  number of blocks to add; zero makes this a no-op.
//   pool            the pool to grow.
//
// top_index, block_size, align_size and offset_size are left unchanged.
void bli_pool_grow
     (
       siz_t            num_blocks_add,
       pool_t* restrict pool
     )
{
	// Receives the status reported by bli_malloc_intl(). It is not inspected
	// here. NOTE(review): presumably bli_malloc_intl() deals with allocation
	// failure itself -- confirm against its definition.
	err_t r_val;

	// If the requested increase is zero, return early.
	if ( num_blocks_add == 0 ) return;

	// Query the allocated length of the block_ptrs array and also the
	// total number of blocks currently allocated.
	const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool );
	const siz_t num_blocks_cur     = bli_pool_num_blocks( pool );

	// Compute the total number of allocated blocks that will exist
	// after we grow the pool.
	const siz_t num_blocks_new = num_blocks_cur + num_blocks_add;

	// If adding num_blocks_add new blocks will exceed the current capacity
	// of the block_ptrs array, we need to first put in place a new (larger)
	// array.
	if ( block_ptrs_len_cur < num_blocks_new )
	{
		// To prevent this from happening often, we double the current
		// length of the block_ptrs array.
		// Sanity: make sure that the block_ptrs_len_new will be at least
		// num_blocks_new, in case doubling the block_ptrs_len_cur is not
		// enough.
		// Example 1:
		// - block_ptrs_len_cur == num_blocks_cur == 0 and num_blocks_add = 1
		// - So doubling: 2 * block_ptrs_len_cur = 0, whereas 1 is expected
		// Example 2:
		// - block_ptrs_len_cur == num_blocks_cur == 10 and num_blocks_add = 30
		// - So doubling: 2 * block_ptrs_len_cur = 20, whereas 40 is expected
		const siz_t block_ptrs_len_new
		  = bli_max( (2 * block_ptrs_len_cur), num_blocks_new );

#ifdef BLIS_ENABLE_MEM_TRACING
		printf( "bli_pool_grow(): growing block_ptrs_len (%d -> %d): ",
		        ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new );
#endif

		// Query the current block_ptrs array.
		pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool );

		// Allocate a new block_ptrs array.
		// FGVZ: Do we want to call malloc_fp() for internal data structures as
		// well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g.
		pblk_t* restrict block_ptrs_new
		  = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val );

		// Query the top_index of the pool.
		const siz_t top_index = bli_pool_top_index( pool );

		// Copy the contents of the old block_ptrs array to the new/resized
		// array. Notice that we can begin with top_index since all entries
		// from 0 to top_index-1 have been (and are currently) checked out
		// to threads.
		// The counter uses siz_t so that it has the same type as both of
		// its bounds (no mixed-type comparison).
		for ( siz_t i = top_index; i < num_blocks_cur; ++i )
		{
			block_ptrs_new[i] = block_ptrs_cur[i];
		}

#ifdef BLIS_ENABLE_MEM_TRACING
		printf( "bli_pool_grow(): freeing prev block_ptrs: " );
#endif

		// Free the old block_ptrs array.
		bli_free_intl( block_ptrs_cur );

		// Update the pool_t struct with the new block_ptrs array and
		// record its allocated length.
		bli_pool_set_block_ptrs( block_ptrs_new, pool );
		bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool );
	}

	// At this point, we are guaranteed to have enough unused elements
	// in the block_ptrs array to accommodate an additional num_blocks_add
	// blocks.

	// Query the current block_ptrs array (which was maybe just resized).
	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );

	// Query the block size, alignment size, and offset size of the pool.
	const siz_t block_size  = bli_pool_block_size( pool );
	const siz_t align_size  = bli_pool_align_size( pool );
	const siz_t offset_size = bli_pool_offset_size( pool );

	// Query the malloc() function pointer for the pool.
	malloc_ft malloc_fp = bli_pool_malloc_fp( pool );

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_pool_grow(): growing pool from (%d -> %d).\n",
	        ( int )num_blocks_cur, ( int )num_blocks_new );
	fflush( stdout );
#endif

	// Allocate the requested additional blocks in the resized array.
	for ( siz_t i = num_blocks_cur; i < num_blocks_new; ++i )
	{
		bli_pool_alloc_block
		(
		  block_size,
		  align_size,
		  offset_size,
		  malloc_fp,
		  &(block_ptrs[i])
		);
	}

	// Update the pool_t struct with the new number of allocated blocks.
	// Notice that top_index remains unchanged, as do the block_size and
	// align_size fields.
	bli_pool_set_num_blocks( num_blocks_new, pool );
}
// Frees up to num_blocks_sub blocks from the end of the pool's block_ptrs
// array and lowers the pool's block count accordingly.
//
//   num_blocks_sub  requested number of blocks to remove; zero is a no-op.
//                   The request is clamped to the number of blocks that are
//                   not currently checked out.
//   pool            the pool to shrink.
//
// The block_ptrs array itself is not reallocated.
void bli_pool_shrink
     (
       siz_t            num_blocks_sub,
       pool_t* restrict pool
     )
{
	// If the requested decrease is zero, return early.
	if ( num_blocks_sub == 0 ) return;

	// Query the total number of blocks currently allocated.
	const siz_t num_blocks = bli_pool_num_blocks( pool );

	// Query the top_index of the pool.
	const siz_t top_index = bli_pool_top_index( pool );

	// Compute the number of blocks available to be checked out
	// (and thus available for removal).
	const siz_t num_blocks_avail = num_blocks - top_index;

	// If the requested decrease is more than the number of available
	// blocks in the pool, only remove the number of blocks actually
	// available.
	num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail );

	// Query the block_ptrs array.
	pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool );

	// Compute the new total number of blocks.
	const siz_t num_blocks_new = num_blocks - num_blocks_sub;

	// Query the offset size of the pool.
	const siz_t offset_size = bli_pool_offset_size( pool );

	// Query the free() function pointer for the pool.
	free_ft free_fp = bli_pool_free_fp( pool );

	// Free the individual blocks. The counter uses siz_t so that it has
	// the same type as both of its bounds (no mixed-type comparison).
	for ( siz_t i = num_blocks_new; i < num_blocks; ++i )
	{
		bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) );
	}

	// Update the pool_t struct.
	bli_pool_set_num_blocks( num_blocks_new, pool );

	// Note that after shrinking the pool, num_blocks < block_ptrs_len.
	// This means the pool can grow again by num_blocks_sub before
	// a re-allocation of block_ptrs is triggered.
}
// Allocates the memory for one pool block and records it in a pblk_t.
//
//   block_size   usable size of the block, in bytes.
//   align_size   alignment passed through to bli_fmalloc_align().
//   offset_size  number of bytes skipped at the start of the allocation.
//   malloc_fp    allocation function handed to bli_fmalloc_align().
//   block        pblk_t that receives the buffer address and block size.
void bli_pool_alloc_block
     (
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       pblk_t* restrict block
     )
{
	// Status output of bli_fmalloc_align(); not examined in this function.
	err_t r_val;

#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d, offset %d)\n",
	        ( int )block_size, ( int )align_size, ( int )offset_size );
	fflush( stdout );
#endif

	// Request block_size + offset_size bytes: the first offset_size bytes
	// are skipped below, so they must be part of the allocation. The
	// bli_fmalloc_align() wrapper takes care of alignment and remembers the
	// original pointer so the block can be released later.
	void* restrict raw
	  = bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val );

#if 0
	// NOTE: This code is disabled because it is not needed, since
	// bli_fmalloc_align() is guaranteed to return an aligned address.
	// Advance the pointer to achieve the necessary alignment, if it is not
	// already aligned.
	if ( bli_is_unaligned_to( ( siz_t )buf_sys, ( siz_t )align_size ) )
	{
		// C99's stdint.h guarantees that a void* can be safely cast to a
		// uintptr_t and then back to a void*, hence the casting of buf_sys
		// and align_size to uintptr_t. buf_align is initially cast to char*
		// to allow pointer arithmetic in units of bytes, and then advanced
		// to the next nearest alignment boundary, and finally cast back to
		// void* before being stored. Notice that the arithmetic works even
		// if the alignment value is not a power of two.
		buf_align = ( void* )( ( char* )buf_align +
		                       ( ( uintptr_t )align_size -
		                         ( uintptr_t )buf_sys %
		                         ( uintptr_t )align_size )
		                     );
	}
#endif

	// Step past the leading offset_size bytes; this is the address that
	// gets stored in the block.
	void* shifted = ( void* )( ( char* )raw + offset_size );

	// Record the buffer address and the block size in the pblk_t.
	bli_pblk_set_block_size( block_size, block );
	bli_pblk_set_buf( shifted, block );
}
// Releases the memory held by one pool block.
//
//   offset_size  byte offset that bli_pool_alloc_block() added to the
//                allocated address before storing it in the block.
//   free_fp      free function handed to bli_ffree_align().
//   block        the pblk_t whose buffer is released.
void bli_pool_free_block
     (
       siz_t            offset_size,
       free_ft          free_fp,
       pblk_t* restrict block
     )
{
#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_pool_free_block(): calling ffree_align(): size %d.\n",
	        ( int )bli_pblk_block_size( block ) );
	fflush( stdout );
#endif

	// The address stored in the block is offset_size bytes past the
	// address that bli_fmalloc_align() returned; step back to recover it.
	char* shifted = ( char* )bli_pblk_buf( block );
	void* aligned = ( void* )( shifted - offset_size );

	// bli_ffree_align() recovers the pointer originally returned by the
	// pool's malloc() function and frees it.
	bli_ffree_align( free_fp, aligned );
}
// Prints the fields of a pool_t, followed by the buffer address of every
// block slot from 0 to num_blocks-1, to stdout.
//
//   pool  the pool to print.
//
// Size fields are narrowed to int for printing.
void bli_pool_print
     (
       pool_t* restrict pool
     )
{
	pblk_t* block_ptrs     = bli_pool_block_ptrs( pool );
	siz_t   block_ptrs_len = bli_pool_block_ptrs_len( pool );
	siz_t   top_index      = bli_pool_top_index( pool );
	siz_t   num_blocks     = bli_pool_num_blocks( pool );
	siz_t   block_size     = bli_pool_block_size( pool );
	siz_t   align_size     = bli_pool_align_size( pool );
	siz_t   offset_size    = bli_pool_offset_size( pool );

	printf( "pool struct ---------------\n" );
	// %p requires a void* argument, so the pblk_t* is cast explicitly.
	printf( " block_ptrs: %p\n", ( void* )block_ptrs );
	printf( " block_ptrs_len: %d\n", ( int )block_ptrs_len );
	printf( " top_index: %d\n", ( int )top_index );
	printf( " num_blocks: %d\n", ( int )num_blocks );
	printf( " block_size: %d\n", ( int )block_size );
	printf( " align_size: %d\n", ( int )align_size );
	printf( " offset_size: %d\n", ( int )offset_size );
	printf( " pblks sys align\n" );

	// The counter uses siz_t so that it has the same type as its bound.
	for ( siz_t i = 0; i < num_blocks; ++i )
	{
		printf( " %d: %p\n", ( int )i, bli_pblk_buf( &block_ptrs[i] ) );
	}
}
// Prints the buffer address stored in a pblk_t to stdout.
//
//   pblk  the pool block to print.
void bli_pblk_print
     (
       pblk_t* restrict pblk
     )
{
	void* block_addr = bli_pblk_buf( pblk );

	printf( "pblk struct ---------------\n" );
	printf( " block address (aligned): %p\n", block_addr );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_pool.h 0000664 0000000 0000000 00000015106 14634250137 0021606 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_POOL_H
#define BLIS_POOL_H
// -- Pool block type --
/*
typedef struct
{
void* buf;
siz_t block_size;
} pblk_t;
*/
// -- Pool type --
/*
typedef struct
{
void* block_ptrs;
siz_t block_ptrs_len;
siz_t top_index;
siz_t num_blocks;
siz_t block_size;
siz_t align_size;
siz_t offset_size;
malloc_ft malloc_fp;
free_ft free_fp;
} pool_t;
*/
// Pool block query

// Returns the buffer address stored in the pool block.
BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk )
{
	return pblk->buf;
}

// Returns the block size recorded in the pool block.
BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk )
{
	return pblk->block_size;
}
// Pool block modification

// Stores a buffer address in the pool block.
BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk )
{
	pblk->buf = buf;
}

// Records a block size in the pool block.
BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk )
{
	pblk->block_size = block_size;
}
//
// -- pool block initialization ------------------------------------------------
//
// Static initializer for a pblk_t: a NULL buffer and a block size of zero.
// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the pblk_t type definition. An alternative to the initializer is
// calling bli_pblk_clear() at runtime.
// The final line of the macro deliberately has no line-continuation
// backslash, so the macro ends at its closing brace and can never absorb
// whatever line happens to follow it.
#define BLIS_PBLK_INITIALIZER \
{ \
.buf = NULL, \
.block_size = 0, \
}
// Resets a pool block at runtime: NULL buffer and a block size of zero.
BLIS_INLINE void bli_pblk_clear( pblk_t* pblk )
{
	pblk->buf        = NULL;
	pblk->block_size = 0;
}
// Pool entry query

// Returns the pool's block_ptrs array as an untyped pointer.
BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool )
{
	return pool->block_ptrs;
}

// Returns the allocated length of the block_ptrs array.
BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool )
{
	return pool->block_ptrs_len;
}

// Returns the number of blocks recorded in the pool.
BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool )
{
	return pool->num_blocks;
}

// Returns the pool's block size.
BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool )
{
	return pool->block_size;
}

// Returns the pool's alignment size.
BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool )
{
	return pool->align_size;
}

// Returns the pool's offset size.
BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool )
{
	return pool->offset_size;
}

// Returns the malloc() function pointer stored in the pool.
BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool )
{
	return pool->malloc_fp;
}

// Returns the free() function pointer stored in the pool.
BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool )
{
	return pool->free_fp;
}

// Returns the pool's top_index.
BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool )
{
	return pool->top_index;
}

// Returns true when top_index equals num_blocks.
BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool )
{
	const siz_t top   = bli_pool_top_index( pool );
	const siz_t total = bli_pool_num_blocks( pool );

	return ( bool )( top == total );
}
// Pool entry modification

// Stores a new block_ptrs array in the pool.
BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool )
{
	pool->block_ptrs = block_ptrs;
}

// Records the allocated length of the block_ptrs array.
BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool )
{
	pool->block_ptrs_len = block_ptrs_len;
}

// Records the number of blocks in the pool.
BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool )
{
	pool->num_blocks = num_blocks;
}

// Sets the pool's block size.
BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool )
{
	pool->block_size = block_size;
}

// Sets the pool's alignment size.
BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool )
{
	pool->align_size = align_size;
}

// Sets the pool's offset size.
BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool )
{
	pool->offset_size = offset_size;
}

// Sets the malloc() function pointer used by the pool.
BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool )
{
	pool->malloc_fp = malloc_fp;
}

// Sets the free() function pointer used by the pool.
BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* pool )
{
	pool->free_fp = free_fp;
}

// Sets the pool's top_index.
BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool )
{
	pool->top_index = top_index;
}
// -----------------------------------------------------------------------------
// Prototypes for the pool functions defined in bli_pool.c.

// Initializes a pool. NOTE(review): the definition is not visible from this
// header; the parameters presumably populate the like-named pool_t fields --
// confirm against bli_pool.c.
void bli_pool_init
(
siz_t num_blocks,
siz_t block_ptrs_len,
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
free_ft free_fp,
pool_t* restrict pool
);
// Finalizes a pool. NOTE(review): presumably releases the pool's blocks and
// its block_ptrs array -- confirm against bli_pool.c.
void bli_pool_finalize
(
pool_t* restrict pool
);
// Reinitializes a pool with new sizes. bli_pool_checkout_block() calls this
// with the requested size as block_size_new.
void bli_pool_reinit
(
siz_t num_blocks_new,
siz_t block_ptrs_len_new,
siz_t block_size_new,
siz_t align_size_new,
siz_t offset_size_new,
pool_t* restrict pool
);
// Checks a block out of the pool: copies the pblk_t at top_index into *block
// and increments top_index, growing the pool by one block first if it is
// exhausted.
void bli_pool_checkout_block
(
siz_t req_size,
pblk_t* restrict block,
pool_t* restrict pool
);
// Checks a block back in: stores *block at top_index - 1 and decrements
// top_index. A block whose size differs from the pool's current block size
// is freed instead.
void bli_pool_checkin_block
(
pblk_t* restrict block,
pool_t* restrict pool
);
// Adds num_blocks_add newly allocated blocks to the pool, enlarging the
// block_ptrs array if needed. A value of zero is a no-op.
void bli_pool_grow
(
siz_t num_blocks_add,
pool_t* restrict pool
);
// Frees up to num_blocks_sub blocks that are not checked out and reduces
// the pool's block count. A value of zero is a no-op.
void bli_pool_shrink
(
siz_t num_blocks_sub,
pool_t* restrict pool
);
// Allocates block_size + offset_size bytes via bli_fmalloc_align() and stores
// the address, advanced by offset_size, along with block_size in *block.
void bli_pool_alloc_block
(
siz_t block_size,
siz_t align_size,
siz_t offset_size,
malloc_ft malloc_fp,
pblk_t* restrict block
);
// Frees the buffer of *block via bli_ffree_align(), after stepping the stored
// address back by offset_size.
void bli_pool_free_block
(
siz_t offset_size,
free_ft free_fp,
pblk_t* restrict block
);
// Prints the pool's fields and the buffer address of each block to stdout.
void bli_pool_print
(
pool_t* restrict pool
);
// Prints the buffer address of a pool block to stdout.
void bli_pblk_print
(
pblk_t* restrict pblk
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_prune.c 0000664 0000000 0000000 00000013104 14634250137 0021755 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Prunes the unreferenced (unstored/zero) region from a structured primary
// object p along dimension mdim_p, and applies the same dimension change to
// the secondary object s along mdim_s so the two stay conformal.
//
//   p       primary object; its structure decides what is pruned.
//   mdim_p  dimension of p along which it is being partitioned.
//   s       secondary object, updated in tandem with p.
//   mdim_s  dimension of s that corresponds to mdim_p.
void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                             obj_t* s, mdim_t mdim_s )
{
	// A general object has no structure and therefore no unreferenced
	// parts: nothing to do.
	if ( bli_obj_is_general( p ) ) return;

	// A triangular primary object marked BLIS_ZEROS is made empty along the
	// partitioned dimension (and s along its matching dimension). This is a
	// minor optimization rather than a requirement: with empty ranges,
	// threads that would otherwise receive BLIS_ZEROS subproblems never
	// reach the macro-kernel.
	// NOTE: only triangular objects can have subpartitions marked as
	// BLIS_ZEROS, hence the structure test.
	if ( bli_obj_is_triangular( p ) && bli_obj_is_zeros( p ) )
	{
		bli_obj_set_dim( mdim_p, 0, p );
		bli_obj_set_dim( mdim_s, 0, s );
		return;
	}

	// From here on p is assumed to be hermitian, symmetric, or triangular,
	// with its unstored region unreferenced; callers should not invoke this
	// function otherwise. (An explicit structure test used to guard this
	// section and is intentionally not performed.)
	doff_t diagoff = bli_obj_diag_offset( p );
	dim_t  len_m   = bli_obj_length( p );
	dim_t  len_n   = bli_obj_width( p );
	uplo_t uplo    = bli_obj_uplo( p );
	dim_t  off_inc = 0;

	// Account for implicit transposition of p and of s.
	if ( bli_obj_has_trans( p ) )
	{
		bli_reflect_about_diag( &diagoff, &uplo, &len_m, &len_n );
		bli_toggle_dim( &mdim_p );
	}
	if ( bli_obj_has_trans( s ) )
	{
		bli_toggle_dim( &mdim_s );
	}

	// Whether p is being partitioned along the m dimension. mdim_p does not
	// change past this point, so the answer is computed once.
	const int along_m = bli_is_m_dim( mdim_p );

	// Remove the zero region, chosen by the stored triangle of p and the
	// dimension being partitioned.
	if ( bli_obj_is_lower( p ) )
	{
		if ( along_m )
		{
			bli_prune_unstored_region_top_l( &diagoff, &len_m, &len_n, &off_inc );
		}
		else
		{
			bli_prune_unstored_region_right_l( &diagoff, &len_m, &len_n, &off_inc );
		}
	}
	else if ( bli_obj_is_upper( p ) )
	{
		if ( along_m )
		{
			bli_prune_unstored_region_bottom_u( &diagoff, &len_m, &len_n, &off_inc );
		}
		else
		{
			bli_prune_unstored_region_left_u( &diagoff, &len_m, &len_n, &off_inc );
		}
	}
	else if ( bli_obj_is_dense( p ) )
	{
		// Structured matrices are almost never dense, but a dense one has
		// no unreferenced region to prune.
		return;
	}
	else
	{
		// Sanity check: the remaining case is a zeros object, which
		// hermitian/symmetric matrices should never have.
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	// The (possibly reduced) extent of the partitioned dimension.
	const dim_t pruned_dim = ( along_m ? len_m : len_n );

	// Write the results back. Changing the dimension and diagonal offset of
	// a packed primary object is fine provided the secondary object is
	// changed in tandem; the ignorable zero region is then skipped here
	// instead of inside the macro-kernel.
	bli_obj_set_diag_offset( diagoff, p );
	bli_obj_set_dim( mdim_p, pruned_dim, p );
	bli_obj_set_dim( mdim_s, pruned_dim, s );

	// Offsets are advanced only for objects that are NOT packed; doing so
	// for a packed object would make bli_obj_buffer_at_off() compute the
	// wrong address within the macro-kernel object wrapper.
	if ( !bli_obj_is_packed( p ) ) { bli_obj_inc_off( mdim_p, off_inc, p ); }
	if ( !bli_obj_is_packed( s ) ) { bli_obj_inc_off( mdim_s, off_inc, s ); }
}
cython-blis-1.0.0/blis/_src/frame/base/bli_prune.h 0000664 0000000 0000000 00000003367 14634250137 0021774 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prunes the unreferenced region of structured primary object p along
// dimension mdim_p, updating secondary object s along mdim_s in tandem.
// Defined in bli_prune.c.
void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
obj_t* s, mdim_t mdim_s );
cython-blis-1.0.0/blis/_src/frame/base/bli_query.c 0000664 0000000 0000000 00000013450 14634250137 0021775 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Tests two objects for equality and returns the result.
//
//   a, b  the objects to compare.
//
// The comparison is delegated to bli_eqsc() when both objects are 1x1, to
// bli_eqv() when both are vectors, and to bli_eqm() otherwise.
bool bli_obj_equals( obj_t* a, obj_t* b )
{
#if 0
	bool r_val = FALSE;
	num_t dt_a;
	num_t dt_b;
	num_t dt;
	// The function is not yet implemented for vectors and matrices.
	if ( !bli_obj_is_1x1( a ) ||
	     !bli_obj_is_1x1( b ) )
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	dt_a = bli_obj_dt( a );
	dt_b = bli_obj_dt( b );
	// If B is BLIS_CONSTANT, then we need to test equality based on the
	// datatype of A--this works even if A is also BLIS_CONSTANT. If B
	// is a regular non-constant type, then we should use its datatype
	// to test equality.
	if ( dt_b == BLIS_CONSTANT ) dt = dt_a;
	else dt = dt_b;
	// Now test equality based on the chosen datatype.
	if ( dt == BLIS_CONSTANT )
	{
		dcomplex* ap_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, a );
		dcomplex* bp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, b );
		// We only test equality for one datatype (double complex) since
		// we expect either all fields within the constant to be equal or
		// none to be equal. Therefore, we can just test one of them.
		r_val = bli_zeqa( ap_z, bp_z );
	}
	else
	{
		void* buf_a = bli_obj_buffer_for_1x1( dt, a );
		void* buf_b = bli_obj_buffer_for_1x1( dt, b );
		if ( dt == BLIS_FLOAT ) r_val = bli_seqa( buf_a, buf_b );
		else if ( dt == BLIS_DOUBLE ) r_val = bli_deqa( buf_a, buf_b );
		else if ( dt == BLIS_SCOMPLEX ) r_val = bli_ceqa( buf_a, buf_b );
		else if ( dt == BLIS_DCOMPLEX ) r_val = bli_zeqa( buf_a, buf_b );
		else if ( dt == BLIS_INT ) r_val = bli_ieqa( buf_a, buf_b );
	}
	return r_val;
#else
	// Written by whichever comparison routine is selected below.
	bool is_equal;

	if ( bli_obj_is_1x1( a ) && bli_obj_is_1x1( b ) )
	{
		// Scalar vs. scalar.
		bli_eqsc( a, b, &is_equal );
	}
	else if ( bli_obj_is_vector( a ) && bli_obj_is_vector( b ) )
	{
		// Vector vs. vector.
		bli_eqv( a, b, &is_equal );
	}
	else
	{
		// Everything else is compared as matrices.
		bli_eqm( a, b, &is_equal );
	}

	return is_equal;
#endif
}
// Tests whether the imaginary part of scalar a equals the (real) scalar b.
//
//   a  1x1 object whose imaginary part is examined.
//   b  1x1, non-complex object supplying the value to compare against.
//
// Returns TRUE when the two values, read as doubles, compare equal, and
// FALSE otherwise. If either object is not 1x1, or b is complex,
// bli_check_error_code() is called with BLIS_NOT_YET_IMPLEMENTED.
bool bli_obj_imag_equals( obj_t* a, obj_t* b )
{
#if 0
	bool r_val = FALSE;
	num_t dt_a;
	num_t dt_b;
	dt_a = bli_obj_dt( a );
	dt_b = bli_obj_dt( b );
	// The function is not yet implemented for vectors and matrices.
	if ( !bli_obj_is_1x1( a ) ||
	     !bli_obj_is_1x1( b ) ||
	     bli_is_constant( dt_a ) ||
	     bli_is_complex( dt_b ) )
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	// Handle the special (trivial) case where a is real, in which
	// case all we have to do is test whether b is zero.
	if ( bli_is_real( dt_a ) )
	{
		r_val = bli_obj_equals( &BLIS_ZERO, b );
	}
	else // if ( bli_is_complex( dt_a ) )
	{
		num_t dt_a_real = bli_dt_proj_to_real( dt_a );
		// Now we compare the imaginary part of a to b. Notice that since
		// we are using bli_obj_buffer_for_1x1() to acquire the buffer for
		// b, this works regardless of whether b is BLIS_CONSTANT.
		if ( dt_a == BLIS_SCOMPLEX )
		{
			scomplex* ap_c = bli_obj_buffer_at_off( a );
			float* bp_c = bli_obj_buffer_for_1x1( dt_a_real, b );
			r_val = bli_seq( bli_cimag( *ap_c ), *bp_c );
		}
		else if ( dt_a == BLIS_DCOMPLEX )
		{
			dcomplex* ap_z = bli_obj_buffer_at_off( a );
			double* bp_z = bli_obj_buffer_for_1x1( dt_a_real, b );
			r_val = bli_deq( bli_zimag( *ap_z ), *bp_z );
		}
	}
#endif
	// Only scalar operands with a real-valued b are supported; vectors and
	// matrices are not yet implemented.
	if ( !( bli_obj_is_1x1( a ) &&
	        bli_obj_is_1x1( b ) &&
	        !bli_obj_is_complex( b ) ) )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	double a_real, a_imag;
	double b_real, b_imag;

	// Read both components of a into local doubles.
	bli_getsc( a, &a_real, &a_imag );

	// Read b into local doubles as well. Its imaginary component is
	// ignored, since b is known to be real.
	bli_getsc( b, &b_real, &b_imag );

	// The answer is whether imag(a) matches real(b).
	return ( a_imag == b_real ) ? TRUE : FALSE;
}
// Reports whether the imaginary part of scalar a is zero.
//
//   a  1x1 object to examine.
//
// Returns TRUE for any non-complex object, and for a complex object whose
// imaginary part (read as a double) tests equal to zero; FALSE otherwise.
// If a is not 1x1, bli_check_error_code() is called with
// BLIS_NOT_YET_IMPLEMENTED.
bool bli_obj_imag_is_zero( obj_t* a )
{
	// Vectors and matrices are not yet supported.
	if ( !bli_obj_is_1x1( a ) )
	{
		bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
	}

	// Non-complex objects are reported as having a zero imaginary part
	// without reading their value.
	if ( !bli_obj_is_complex( a ) ) return TRUE;

	// Read both components of a into local doubles and test the imaginary
	// one against double-precision zero.
	double a_real, a_imag;
	bli_getsc( a, &a_real, &a_imag );

	return bli_deq0( a_imag ) ? TRUE : FALSE;
}
cython-blis-1.0.0/blis/_src/frame/base/bli_query.h 0000664 0000000 0000000 00000003500 14634250137 0021775 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Exported object query functions, defined in bli_query.c.

// Returns whether objects a and b are equal (scalar, vector, or matrix
// comparison, chosen from the shapes of a and b).
BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );
// Returns whether the imaginary part of 1x1 object a equals the real 1x1
// object b.
BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );
// Returns whether the imaginary part of 1x1 object a is zero; non-complex
// objects always yield TRUE.
BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
cython-blis-1.0.0/blis/_src/frame/base/bli_rntm.c 0000664 0000000 0000000 00000034600 14634250137 0021610 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// The global rntm_t structure, which holds the global thread settings
// along with a few other key parameters. bli_rntm_init_from_global() hands
// out copies of it.
rntm_t global_rntm;
// A mutex used to serialize access to global_rntm. It is statically
// initialized via BLIS_PTHREAD_MUTEX_INITIALIZER.
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
// ----------------------------------------------------------------------------
// Copy the current contents of global_rntm into the caller-provided rntm_t.
//
//   rntm: destination object; it is overwritten with a struct copy of
//         global_rntm.
void bli_rntm_init_from_global( rntm_t* rntm )
{
    // global_rntm is only meaningful after library initialization, so make
    // sure that has happened before reading it.
    bli_init_once();

    // Take the snapshot while holding the mutex that protects global_rntm.
    bli_pthread_mutex_lock( &global_rntm_mutex );
    *rntm = global_rntm;
    bli_pthread_mutex_unlock( &global_rntm_mutex );
}
// -----------------------------------------------------------------------------
// Finalize the per-loop ways of parallelism in rntm for a level-3 operation.
//
//   l3_op:   operation id; only BLIS_TRMM and BLIS_TRSM receive special
//            treatment here.
//   side:    side of the triangular operand (consulted for trmm/trsm only).
//   m, n, k: problem dimensions, forwarded to bli_rntm_set_ways_from_rntm().
//   rntm:    runtime object that is read and updated in place.
void bli_rntm_set_ways_for_op
     (
       opid_t  l3_op,
       side_t  side,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    // First derive a consistent set of ways from whatever is already stored
    // in the rntm_t object.
    bli_rntm_set_ways_from_rntm( m, n, k, rntm );

#if 0
    printf( "bli_rntm_set_ways_for_op()\n" );
    bli_rntm_print( rntm );
#endif

    // Every operation other than trmm and trsm keeps the ways as computed.
    if ( l3_op != BLIS_TRMM && l3_op != BLIS_TRSM ) return;

    // Current ways, as just normalized above.
    const dim_t jc = bli_rntm_jc_ways( rntm );
    const dim_t pc = bli_rntm_pc_ways( rntm );
    const dim_t ic = bli_rntm_ic_ways( rntm );
    const dim_t jr = bli_rntm_jr_ways( rntm );
    const dim_t ir = bli_rntm_ir_ways( rntm );

    // The ways that will be stored. We only reshuffle where the parallelism
    // is extracted, not the total amount of it, so num_threads is left
    // untouched.
    dim_t jc_new = jc;
    dim_t pc_new = pc;
    dim_t ic_new = ic;
    dim_t jr_new = jr;
    dim_t ir_new = ir;

    if ( l3_op == BLIS_TRMM )
    {
        // trmm_r has a dependency in the jc loop, so its jc parallelism is
        // moved into the jr loop. (NOTE: This dependency does not exist for
        // trmm3.) The left-side case keeps the ways unchanged.
        if ( !bli_is_left( side ) )
        {
            jc_new = 1;
            jr_new = jr * jc;
        }
    }
    else // if ( l3_op == BLIS_TRSM )
    {
        if ( bli_is_left( side ) )
        {
            // Left side: fold pc into ic and ir into jr.
            pc_new = 1;
            ic_new = ic * pc;
            jr_new = jr * ir;
            ir_new = 1;
        }
        else // if ( bli_is_right( side ) )
        {
            // Right side: all parallelism is moved into the ic loop.
            jc_new = 1;
            pc_new = 1;
            ic_new = ic * pc * jc * ir * jr;
            jr_new = 1;
            ir_new = 1;
        }
    }

    bli_rntm_set_ways_only( jc_new, pc_new, ic_new, jr_new, ir_new, rntm );
}
// Bring the threading fields of rntm (auto_factor, num_threads and the
// per-loop ways) into a consistent state for the conventional code path.
//
//   m, n: problem dimensions used to guide the automatic factorization.
//   k:    accepted for interface symmetry; not consulted in this function.
//   rntm: runtime object that is read and then overwritten with the
//         normalized values.
void bli_rntm_set_ways_from_rntm
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    // Values currently stored in the runtime object.
    dim_t nt = bli_rntm_num_threads( rntm );
    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool  auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    // If the rntm was fed in as a copy of the global runtime via
    // bli_rntm_init_from_global(), we know that either:
    // - the num_threads field is -1 and all of the ways are -1;
    // - the num_threads field is -1 and all of the ways are set;
    // - the num_threads field is set and all of the ways are -1.
    // However, a user-provided rntm_t may not have been initialized
    // cleanly, so the rules below force it into a predictable state.

    // The number of threads counts as "set" when it is positive.
    const bool nt_set = ( nt > 0 ? TRUE : FALSE );

    // The ways count as "set" when at least one of them is positive.
    const bool ways_set =
      ( ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 ) ? TRUE : FALSE );

    // The auto_factor field simply records whether num_threads was set.
    auto_factor = nt_set;

    if ( ways_set )
    {
        // At least one way was given explicitly: assume the user wants
        // those positive values and default the non-positive ones to 1.
        if ( jc < 1 ) jc = 1;
        if ( pc < 1 ) pc = 1;
        if ( ic < 1 ) ic = 1;
        if ( jr < 1 ) jr = 1;
        if ( ir < 1 ) ir = 1;

        // The thread count then follows from the ways.
        nt = jc * pc * ic * jr * ir;
    }
    else if ( nt_set )
    {
        // Only the number of threads was given: generate a thread
        // factorization automatically, guided by the problem size.

#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
        // When prime thread counts are disallowed for automatic
        // factorizations, a prime nt above the BLIS_NT_MAX_PRIME threshold
        // is reduced by one so that the factorization can span two
        // dimensions (loops), which tends to be more efficient.
        if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
#endif

        pc = 1;

        // Split nt between the ic and jc loops.
        bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
                                      n*BLIS_THREAD_RATIO_N, &ic, &jc );

        // Peel the largest factor (no greater than BLIS_THREAD_MAX_IR) off
        // of ic and give it to the ir loop.
        for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
        {
            if ( ic % ir == 0 ) { ic /= ir; break; }
        }

        // Likewise move a factor of jc (no greater than BLIS_THREAD_MAX_JR)
        // into the jr loop.
        for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- )
        {
            if ( jc % jr == 0 ) { jc /= jr; break; }
        }
    }
    else // neither the ways nor the number of threads were set
    {
        // The rntm was not meaningfully changed since initialization, so
        // default to single-threaded execution.
        nt = 1;
        jc = pc = ic = jr = ir = 1;
    }

#else

    // When multithreading is disabled, always set the rntm_t ways
    // values to 1.
    nt = 1;
    jc = pc = ic = jr = ir = 1;

#endif

    // Write the normalized values back into the runtime object.
    bli_rntm_set_auto_factor_only( auto_factor, rntm );
    bli_rntm_set_num_threads_only( nt, rntm );
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
// Variant of bli_rntm_set_ways_from_rntm() used for the sup code path.
// It differs only in the automatic factorization: the raw m and n are
// passed to bli_thread_partition_2x2() and the ir/jr ways are fixed at 1.
//
//   m, n: problem dimensions used to guide the automatic factorization.
//   k:    accepted for interface symmetry; not consulted in this function.
//   rntm: runtime object that is read and then overwritten with the
//         normalized values.
void bli_rntm_set_ways_from_rntm_sup
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    // Values currently stored in the runtime object.
    dim_t nt = bli_rntm_num_threads( rntm );
    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool  auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    // If the rntm was fed in as a copy of the global runtime via
    // bli_rntm_init_from_global(), we know that either:
    // - the num_threads field is -1 and all of the ways are -1;
    // - the num_threads field is -1 and all of the ways are set;
    // - the num_threads field is set and all of the ways are -1.
    // However, a user-provided rntm_t may not have been initialized
    // cleanly, so the rules below force it into a predictable state.

    // The number of threads counts as "set" when it is positive.
    const bool nt_set = ( nt > 0 ? TRUE : FALSE );

    // The ways count as "set" when at least one of them is positive.
    const bool ways_set =
      ( ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 ) ? TRUE : FALSE );

    // The auto_factor field simply records whether num_threads was set.
    auto_factor = nt_set;

    if ( ways_set )
    {
        // At least one way was given explicitly: assume the user wants
        // those positive values and default the non-positive ones to 1.
        if ( jc < 1 ) jc = 1;
        if ( pc < 1 ) pc = 1;
        if ( ic < 1 ) ic = 1;
        if ( jr < 1 ) jr = 1;
        if ( ir < 1 ) ir = 1;

        // The thread count then follows from the ways.
        nt = jc * pc * ic * jr * ir;
    }
    else if ( nt_set )
    {
        // Only the number of threads was given: generate a thread
        // factorization automatically, guided by the problem size.

#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
        // When prime thread counts are disallowed for automatic
        // factorizations, a prime nt above the BLIS_NT_MAX_PRIME threshold
        // is reduced by one so that the factorization can span two
        // dimensions (loops), which tends to be more efficient.
        if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
#endif

        pc = 1;

        // Split nt between the ic and jc loops using the unscaled
        // dimensions.
        //bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M,
        //                              n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc );
        bli_thread_partition_2x2( nt, m,
                                      n, &ic, &jc );

#if 0
        for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- )
        {
            if ( ic % ir == 0 ) { ic /= ir; break; }
        }

        for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- )
        {
            if ( jc % jr == 0 ) { jc /= jr; break; }
        }
#else
        // No parallelism is assigned to the two innermost loops.
        ir = 1;
        jr = 1;
#endif
    }
    else // neither the ways nor the number of threads were set
    {
        // The rntm was not meaningfully changed since initialization, so
        // default to single-threaded execution.
        nt = 1;
        jc = pc = ic = jr = ir = 1;
    }

#else

    // When multithreading is disabled, always set the rntm_t ways
    // values to 1.
    nt = 1;
    jc = pc = ic = jr = ir = 1;

#endif

    // Write the normalized values back into the runtime object.
    bli_rntm_set_auto_factor_only( auto_factor, rntm );
    bli_rntm_set_num_threads_only( nt, rntm );
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}
// Print the threading fields of rntm to stdout: a header line followed by
// one line holding the auto_factor flag, the thread count and the five
// per-loop ways (jc, pc, ic, jr, ir).
void bli_rntm_print
     (
       rntm_t* rntm
     )
{
    // Every field is narrowed to int for printing.
    const int auto_fac  = ( int )bli_rntm_auto_factor( rntm );
    const int n_threads = ( int )bli_rntm_num_threads( rntm );
    const int jc_ways   = ( int )bli_rntm_jc_ways( rntm );
    const int pc_ways   = ( int )bli_rntm_pc_ways( rntm );
    const int ic_ways   = ( int )bli_rntm_ic_ways( rntm );
    const int jr_ways   = ( int )bli_rntm_jr_ways( rntm );
    const int ir_ways   = ( int )bli_rntm_ir_ways( rntm );

    printf( "rntm contents nt jc pc ic jr ir\n" );
    printf( "autofac? %1d | %4d%4d%4d%4d%4d%4d\n", auto_fac,
            n_threads, jc_ways, pc_ways,
            ic_ways, jr_ways, ir_ways );
}
// -----------------------------------------------------------------------------
// Compute the product of the ways of parallelism for every loop from the
// one identified by *bszid_cur up to, but not including, the BLIS_KR entry.
//
//   bszid_cur: pointer into an array of bszid_t values that is terminated
//              by BLIS_KR. For the bp algorithm the full array is laid out
//              as:
//                BLIS_NC,      // 5th loop
//                BLIS_KC,      // 4th loop
//                BLIS_NO_PART, // pack B
//                BLIS_MC,      // 3rd loop
//                BLIS_NO_PART, // pack A
//                BLIS_NR,      // 2nd loop
//                BLIS_MR,      // 1st loop
//                BLIS_KR       // ukr loop
//              (the pb algorithm array ends with the same NR, MR, KR tail).
//   rntm:      runtime object supplying the ways for each loop.
//
// Returns the product of the ways; 1 if bszid_cur already points at BLIS_KR.
dim_t bli_rntm_calc_num_threads_in
     (
       bszid_t* restrict bszid_cur,
       rntm_t*  restrict rntm
     )
{
    dim_t n_threads_in = 1;

    // Walk forward through the array until the BLIS_KR terminator.
    while ( *bszid_cur != BLIS_KR )
    {
        const bszid_t bszid = *bszid_cur;

        bszid_cur++;

        // BLIS_NO_PART entries (the packing levels) contribute nothing.
        if ( bszid == BLIS_NO_PART ) continue;

        // Anything else is assumed to be in {NC,KC,MC,NR,MR,KR}.
        n_threads_in *= bli_rntm_ways_for( bszid, rntm );
    }

    return n_threads_in;
}
#if 0
for ( ; *bszid_cur != BLIS_KR; bszid_cur++ )
{
const bszid_t bszid = *bszid_cur;
dim_t cur_way = 1;
// We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not
// BLIS_NO_PART.
if ( bszid != BLIS_NO_PART )
cur_way = bli_rntm_ways_for( bszid, rntm );
else
cur_way = 1;
n_threads_in *= cur_way;
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_rntm.h 0000664 0000000 0000000 00000024540 14634250137 0021617 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2016, Hewlett Packard Enterprise Development LP
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_RNTM_H
#define BLIS_RNTM_H
// Runtime object type (defined in bli_type_defs.h)
/*
typedef struct rntm_s
{
bool auto_factor;
dim_t num_threads;
dim_t* thrloop;
bool pack_a;
bool pack_b;
bool l3_sup;
pool_t* sba_pool;
pba_t* pba;
} rntm_t;
*/
//
// -- rntm_t query (public API) ------------------------------------------------
//
// Return the auto_factor flag stored in the rntm_t.
BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm )
{
return rntm->auto_factor;
}
// Return the num_threads field stored in the rntm_t. A value of -1 is the
// cleared state (see bli_rntm_clear_num_threads_only()).
BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm )
{
return rntm->num_threads;
}
// Return the ways of parallelism recorded for the loop identified by bszid.
// bszid indexes rntm->thrloop directly; no range check is performed.
BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm )
{
return rntm->thrloop[ bszid ];
}
// Per-loop convenience getters. Each one forwards to bli_rntm_ways_for()
// with the blocksize id that corresponds to the named loop.

// Ways of parallelism for the jc loop (BLIS_NC).
BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_NC, rntm );
}
// Ways of parallelism for the pc loop (BLIS_KC).
BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_KC, rntm );
}
// Ways of parallelism for the ic loop (BLIS_MC).
BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_MC, rntm );
}
// Ways of parallelism for the jr loop (BLIS_NR).
BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_NR, rntm );
}
// Ways of parallelism for the ir loop (BLIS_MR).
BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_MR, rntm );
}
// Ways of parallelism for the pr loop (BLIS_KR).
BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm )
{
return bli_rntm_ways_for( BLIS_KR, rntm );
}
// Return the pack_a flag (whether matrix A should be packed).
BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm )
{
return ( bool )( rntm->pack_a );
}
// Return the pack_b flag (whether matrix B should be packed).
BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm )
{
return ( bool )( rntm->pack_b );
}
// Return the l3_sup flag (whether level-3 sup handling is enabled).
BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm )
{
return rntm->l3_sup;
}
//
// -- rntm_t query (internal use only) -----------------------------------------
//
// Return the small block allocator pool pointer embedded in the rntm_t.
// The pointer may be NULL (see bli_rntm_clear_sba_pool()).
BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm )
{
return rntm->sba_pool;
}
// Return the pba_t pointer embedded in the rntm_t. The pointer may be NULL
// (see bli_rntm_clear_pba()).
BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm )
{
return rntm->pba;
}
#if 0
BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 )
{
const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 );
const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 );
const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 );
const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 );
const bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 );
const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 );
const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 );
if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE;
else return FALSE;
}
#endif
//
// -- rntm_t modification (internal use only) ----------------------------------
//
// The "_only" setters write exactly one field and leave every other field
// of the rntm_t untouched.

// Store the auto_factor flag.
BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm )
{
rntm->auto_factor = auto_factor;
}
// Store the total number of threads without touching the per-loop ways.
BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm )
{
rntm->num_threads = nt;
}
// Store the ways of parallelism for a single loop. loop indexes
// rntm->thrloop directly; no range check is performed.
BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm )
{
rntm->thrloop[ loop ] = n_ways;
}
// Per-loop convenience setters. Each one forwards to
// bli_rntm_set_ways_for_only() with the blocksize id of the named loop.

// Set the ways of parallelism for the jc loop (BLIS_NC).
BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm );
}
// Set the ways of parallelism for the pc loop (BLIS_KC).
BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm );
}
// Set the ways of parallelism for the ic loop (BLIS_MC).
BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm );
}
// Set the ways of parallelism for the jr loop (BLIS_NR).
BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm );
}
// Set the ways of parallelism for the ir loop (BLIS_MR).
BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm );
}
// Set the ways of parallelism for the pr loop (BLIS_KR).
BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm )
{
bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm );
}
// Store the ways of parallelism for the jc, pc, ic, jr and ir loops, and
// fix the pr loop at 1 way. The num_threads field is not modified.
BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm )
{
    // One entry per loop, keyed by the loop's blocksize id.
    bli_rntm_set_ways_for_only( BLIS_NC, jc, rntm );
    bli_rntm_set_ways_for_only( BLIS_KC, pc, rntm );
    bli_rntm_set_ways_for_only( BLIS_MC, ic, rntm );
    bli_rntm_set_ways_for_only( BLIS_NR, jr, rntm );
    bli_rntm_set_ways_for_only( BLIS_MR, ir, rntm );

    // The pr loop always gets a single way.
    bli_rntm_set_ways_for_only( BLIS_KR, 1, rntm );
}
// Embed a small block allocator pool pointer into the rntm_t. Only the
// pointer is stored.
BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm )
{
rntm->sba_pool = sba_pool;
}
// Embed a pba_t pointer into the rntm_t. Only the pointer is stored.
BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm )
{
rntm->pba = pba;
}
// Reset num_threads to its "unset" value of -1.
BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm )
{
bli_rntm_set_num_threads_only( -1, rntm );
}
// Reset the jc, pc, ic, jr and ir ways to their "unset" value of -1. Because
// this goes through bli_rntm_set_ways_only(), the pr ways are set to 1
// rather than -1.
BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm )
{
bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm );
}
// Reset the sba pool pointer to NULL.
BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm )
{
bli_rntm_set_sba_pool( NULL, rntm );
}
// Reset the pba pointer to NULL.
BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm )
{
bli_rntm_set_pba( NULL, rntm );
}
//
// -- rntm_t modification (public API) -----------------------------------------
//
// Request a total number of threads and let the per-loop ways be chosen
// later: nt is stored and the ways are reset to their cleared state, so the
// two kinds of settings are never both active after this call.
BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm )
{
// Record the total number of threads to use.
bli_rntm_set_num_threads_only( nt, rntm );
// Set the individual ways of parallelism to default states.
bli_rntm_clear_ways_only( rntm );
}
// Request explicit ways of parallelism for the jc, pc, ic, jr and ir loops.
// The pr loop is fixed at 1 way, and num_threads is reset to its cleared
// value (-1) so that the ways are the only active threading setting.
BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm )
{
    // Record the per-loop ways (this also sets the pr ways to 1).
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );

    // Put the num_threads field back into its default state.
    bli_rntm_clear_num_threads_only( rntm );
}
// Set whether matrix A should be packed. No other field is modified.
BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm )
{
// Set the bool indicating whether matrix A should be packed.
rntm->pack_a = pack_a;
}
// Set whether matrix B should be packed. No other field is modified.
BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm )
{
// Set the bool indicating whether matrix B should be packed.
rntm->pack_b = pack_b;
}
// Set whether level-3 sup handling is enabled. No other field is modified.
BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm )
{
// Set the bool indicating whether level-3 sup handling is enabled.
rntm->l3_sup = l3_sup;
}
// Shorthand for bli_rntm_set_l3_sup( TRUE, rntm ).
BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm )
{
bli_rntm_set_l3_sup( TRUE, rntm );
}
// Shorthand for bli_rntm_set_l3_sup( FALSE, rntm ).
BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm )
{
bli_rntm_set_l3_sup( FALSE, rntm );
}
//
// -- rntm_t modification (internal use only) ----------------------------------
//
// Reset pack_a to its default of FALSE.
BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm )
{
bli_rntm_set_pack_a( FALSE, rntm );
}
// Reset pack_b to its default of FALSE.
BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm )
{
bli_rntm_set_pack_b( FALSE, rntm );
}
// Reset l3_sup to its default. Note that the default is TRUE (enabled), so
// "clearing" this field turns sup handling on, not off.
BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
{
bli_rntm_set_l3_sup( TRUE, rntm );
}
//
// -- rntm_t initialization ----------------------------------------------------
//
// NOTE: Initialization is not necessary as long as the user calls at least
// ONE of the public "set" accessors, each of which guarantees that the rntm_t
// will be in a good state upon return.
//
// Static initializer for rntm_t objects, for use in declarations such as
//   rntm_t rntm = BLIS_RNTM_INITIALIZER;
// It applies the same defaults as bli_rntm_init(), except that all six
// thrloop entries start at -1.
//
// The closing brace line intentionally carries no line-continuation
// backslash: a trailing backslash there would splice whatever line follows
// the macro into its replacement text.
#define BLIS_RNTM_INITIALIZER \
        { \
          .auto_factor = TRUE, \
          .num_threads = -1, \
          .thrloop     = { -1, -1, -1, -1, -1, -1 }, \
          .pack_a      = FALSE, \
          .pack_b      = FALSE, \
          .l3_sup      = TRUE, \
          .sba_pool    = NULL, \
          .pba         = NULL, \
        }
// Put every field of an rntm_t into its default state:
//   auto_factor = TRUE, num_threads = -1, jc/pc/ic/jr/ir ways = -1 (with the
//   pr ways set to 1), pack_a = pack_b = FALSE, l3_sup = TRUE, and both the
//   sba_pool and pba pointers set to NULL.
BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
{
    // Threading-related fields.
    bli_rntm_set_auto_factor_only( TRUE, rntm );
    bli_rntm_set_num_threads_only( -1, rntm );
    bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm );

    // Packing and sup flags.
    bli_rntm_set_pack_a( FALSE, rntm );
    bli_rntm_set_pack_b( FALSE, rntm );
    bli_rntm_set_l3_sup( TRUE, rntm );

    // Allocator pointers.
    bli_rntm_set_sba_pool( NULL, rntm );
    bli_rntm_set_pba( NULL, rntm );
}
// -- rntm_t total thread calculation ------------------------------------------
// Return the total number of threads implied by the per-loop ways stored in
// rntm, i.e. the product of the jc, pc, ic, jr and ir ways. The pr ways are
// not part of the product.
BLIS_INLINE dim_t bli_rntm_calc_num_threads
     (
       rntm_t* restrict rntm
     )
{
    return bli_rntm_ways_for( BLIS_NC, rntm )
         * bli_rntm_ways_for( BLIS_KC, rntm )
         * bli_rntm_ways_for( BLIS_MC, rntm )
         * bli_rntm_ways_for( BLIS_NR, rntm )
         * bli_rntm_ways_for( BLIS_MR, rntm );
}
// -----------------------------------------------------------------------------
// Function prototypes
// Copy the global runtime settings into rntm.
BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
// Normalize the ways stored in rntm and then adjust them for the given
// level-3 operation and side.
BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
(
opid_t l3_op,
side_t side,
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm
);
// Bring num_threads, auto_factor and the per-loop ways of rntm into a
// consistent state (conventional code path).
void bli_rntm_set_ways_from_rntm
(
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm
);
// Same as bli_rntm_set_ways_from_rntm(), but for the sup code path.
void bli_rntm_set_ways_from_rntm_sup
(
dim_t m,
dim_t n,
dim_t k,
rntm_t* rntm
);
// Print the threading fields of rntm to stdout.
void bli_rntm_print
(
rntm_t* rntm
);
// Multiply together the ways of every loop from *bszid_cur up to, but not
// including, the BLIS_KR entry of the bszid array.
dim_t bli_rntm_calc_num_threads_in
(
bszid_t* restrict bszid_cur,
rntm_t* restrict rntm
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_sba.c 0000664 0000000 0000000 00000013432 14634250137 0021375 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Statically initialize the mutex within the small block allocator.
// Note that the sba is an apool_t of array_t of pool_t.
// The object has internal linkage; other translation units reach it through
// bli_sba_query().
static apool_t sba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER };
// Return the address of the file-local small block allocator object.
apool_t* bli_sba_query( void )
{
return &sba;
}
// -----------------------------------------------------------------------------
// Initialize the small block allocator by initializing its apool_t.
void bli_sba_init( void )
{
bli_apool_init( &sba );
}
// Finalize the small block allocator by finalizing its apool_t.
void bli_sba_finalize( void )
{
bli_apool_finalize( &sba );
}
// Obtain a small block of at least req_size bytes.
//
//   rntm:     runtime object whose sba pool (if any) supplies the block;
//             may be NULL.
//   req_size: number of bytes requested.
//
// When sba pools are enabled and rntm carries a non-NULL pool, a block is
// checked out of that pool (aborting if the pool's block size is smaller
// than req_size). In every other case the block comes from
// bli_malloc_intl(). Returns the address of the block.
void* bli_sba_acquire
     (
       rntm_t* restrict rntm,
       siz_t            req_size
     )
{
    void* block;
    err_t r_val;

#ifdef BLIS_ENABLE_SBA_POOLS

    // A NULL rntm is handled exactly like an rntm without an sba pool.
    // We don't expect NULL sba_pool pointers in the normal course of BLIS
    // operation, but it is occasionally convenient to call this function
    // without a valid sba pool data structure (e.g. the gemm_ukr and related
    // test modules in the BLIS testsuite, where the malloc() happens outside
    // of the timed region and so does no harm).
    pool_t* restrict pool =
      ( rntm != NULL ? bli_rntm_sba_pool( rntm ) : NULL );

    if ( pool == NULL )
    {
        block = bli_malloc_intl( req_size, &r_val );
    }
    else
    {
        pblk_t pblk;

        // Blocks are always requested at exactly the pool's block size.
        const siz_t block_size = bli_pool_block_size( pool );

        // Sanity check: the request must fit within one pool block.
        if ( block_size < req_size )
        {
            printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n",
                    ( int )block_size, ( int )req_size );
            bli_abort();
        }

        // Check out a block and extract its address from the pblk_t.
        bli_pool_checkout_block( block_size, &pblk, pool );
        block = bli_pblk_buf( &pblk );
    }

#else

    block = bli_malloc_intl( req_size, &r_val );

#endif

    return block;
}
// Return a block previously obtained from bli_sba_acquire().
//
//   rntm:  runtime object whose sba pool (if any) receives the block; may
//          be NULL.
//   block: address of the block being released.
//
// When sba pools are enabled and rntm carries a non-NULL pool, the block is
// checked back into that pool; otherwise it is passed to bli_free_intl().
void bli_sba_release
     (
       rntm_t* restrict rntm,
       void*   restrict block
     )
{
#ifdef BLIS_ENABLE_SBA_POOLS

    // A NULL rntm is handled exactly like an rntm without an sba pool.
    pool_t* restrict pool =
      ( rntm != NULL ? bli_rntm_sba_pool( rntm ) : NULL );

    if ( pool == NULL )
    {
        bli_free_intl( block );
        return;
    }

    // Query the pool's block size. It is not essential for this "leaf"
    // use of the pool_t, but keeping the pblk_t's block_size field accurate
    // is good housekeeping in case it is ever read.
    const siz_t block_size = bli_pool_block_size( pool );

    // Wrap the address and block size in a pblk_t.
    pblk_t pblk;
    bli_pblk_set_buf( block, &pblk );
    bli_pblk_set_block_size( block_size, &pblk );

    // Check the pblk_t back in. A local pblk_t is fine here because its
    // contents are copied into the pool's internal array of pblk_t.
    bli_pool_checkin_block( &pblk, pool );

#else

    bli_free_intl( block );

#endif
}
// Check out an array_t from the small block allocator.
//
//   n_threads: forwarded to bli_apool_checkout_array().
//
// Returns the checked-out array, or NULL when sba pools are disabled.
array_t* bli_sba_checkout_array
     (
       const siz_t n_threads
     )
{
#ifdef BLIS_ENABLE_SBA_POOLS
    return bli_apool_checkout_array( n_threads, &sba );
#else
    // Without sba pools there is no array to hand out.
    ( void )n_threads;
    return NULL;
#endif
}
// Check an array_t back into the small block allocator. This is a no-op
// when sba pools are disabled.
void bli_sba_checkin_array
     (
       array_t* restrict array
     )
{
#ifdef BLIS_ENABLE_SBA_POOLS
    bli_apool_checkin_array( array, &sba );
#else
    // Nothing was checked out, so there is nothing to return.
    ( void )array;
#endif
}
// Embed into rntm the pool_t found at position index of array.
//
//   index: element of the array_t to use.
//   array: array_t previously checked out from the sba.
//   rntm:  runtime object that receives the pool pointer.
//
// When sba pools are disabled the rntm's sba pool pointer is set to NULL
// and index/array are ignored.
void bli_sba_rntm_set_pool
     (
       siz_t             index,
       array_t* restrict array,
       rntm_t*  restrict rntm
     )
{
#ifdef BLIS_ENABLE_SBA_POOLS
    // Query the pool_t* in the array_t corresponding to index.
    pool_t* restrict pool = bli_apool_array_elem( index, array );
#else
    ( void )index;
    ( void )array;
    pool_t* restrict pool = NULL;
#endif

    // Embed the pool_t* (or NULL) into the rntm_t.
    bli_rntm_set_sba_pool( pool, rntm );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_sba.h 0000664 0000000 0000000 00000004516 14634250137 0021405 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef BLIS_SBA_H
#define BLIS_SBA_H
// Return a pointer to the small block allocator's apool_t.
apool_t* bli_sba_query( void );
// -----------------------------------------------------------------------------
// Initialize / finalize the small block allocator.
void bli_sba_init( void );
void bli_sba_finalize( void );
// Check out an array_t from the sba (NULL when sba pools are disabled).
array_t* bli_sba_checkout_array
(
const siz_t n_threads
);
// Check an array_t back into the sba (no-op when sba pools are disabled).
void bli_sba_checkin_array
(
array_t* restrict array
);
// Embed the pool_t at position index of array into rntm (NULL is embedded
// when sba pools are disabled).
void bli_sba_rntm_set_pool
(
siz_t index,
array_t* restrict array,
rntm_t* restrict rntm
);
// Acquire a block of at least req_size bytes, from the rntm's sba pool when
// one is available and from bli_malloc_intl() otherwise.
void* bli_sba_acquire
(
rntm_t* restrict rntm,
siz_t req_size
);
// Release a block obtained from bli_sba_acquire().
void bli_sba_release
(
rntm_t* restrict rntm,
void* restrict block
);
#endif
cython-blis-1.0.0/blis/_src/frame/base/bli_setgetijm.c 0000664 0000000 0000000 00000011327 14634250137 0022624 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the type-specific setijm functions that the GENTFUNC macro
// further down in this file defines. bli_setijm() calls through a pointer of
// this type, passing the value pair (ar, ai), the element indices (i, j), and
// the buffer address with its row and column strides.
typedef void (*setijm_fp)
(
double ar,
double ai,
dim_t i,
dim_t j,
void* restrict b, inc_t rs, inc_t cs
);
// Table of setijm function pointers; bli_setijm() indexes it with the
// object's num_t datatype. GENARRAY is defined elsewhere in BLIS --
// presumably it expands to one entry per floating-point datatype.
static setijm_fp GENARRAY(ftypes_setijm,setijm);
// Set element (i,j) of the object b from the value pair (ar, ai).
//
// ar, ai - real and imaginary components handed to the type-specific setter.
//          How ai is treated for real datatypes is decided by the "sets"
//          scalar macro, which is defined elsewhere.
// i, j   - zero-based row and column index; must satisfy 0 <= i < m and
//          0 <= j < n, where m x n are the dimensions of b.
// b      - destination object.
//
// Returns BLIS_FAILURE, without writing to b, if either index is out of
// bounds or if b has datatype BLIS_CONSTANT; otherwise returns BLIS_SUCCESS.
err_t bli_setijm
     (
       double ar,
       double ai,
       dim_t  i,
       dim_t  j,
       obj_t* b
     )
{
	dim_t m  = bli_obj_length( b );
	dim_t n  = bli_obj_width( b );
	// Strides are increments rather than dimensions, so hold them as inc_t,
	// matching the rs/cs parameters of setijm_fp.
	inc_t rs = bli_obj_row_stride( b );
	inc_t cs = bli_obj_col_stride( b );
	num_t dt = bli_obj_dt( b );

	// Return error if i or j is beyond bounds of the matrix/vector.
	if ( i < 0 || m <= i ) return BLIS_FAILURE;
	if ( j < 0 || n <= j ) return BLIS_FAILURE;

	// Don't modify scalar constants.
	if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE;

	// Query the pointer to the buffer at the adjusted offsets.
	void* b_p = bli_obj_buffer_at_off( b );

	// Index into the function pointer array.
	setijm_fp f = ftypes_setijm[ dt ];

	// Invoke the type-specific function.
	f
	(
	  ar,
	  ai,
	  i,
	  j,
	  b_p, rs, cs
	);

	return BLIS_SUCCESS;
}
// Template for the type-specific setijm functions. Each instantiation
// treats b as an array of ctype and stores (ar, ai) into the element that
// lies i row strides and j column strides past the start of the buffer.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs \
     ) \
{ \
	/* Address of element (i,j) within the typed buffer. */ \
	ctype* restrict elem_ij = ( ctype* )b + i*rs + j*cs; \
\
	/* Store the double-precision pair into the element. */ \
	PASTEMAC2(z,ch,sets)( ar, ai, *elem_ij ); \
}

INSERT_GENTFUNC_BASIC0( setijm )
// -----------------------------------------------------------------------------
// Signature of the type-specific getijm functions that the GENTFUNC macro
// further down in this file defines. bli_getijm() calls through a pointer of
// this type, passing the element indices (i, j), the buffer address with its
// row and column strides, and the two output locations ar and ai.
typedef void (*getijm_fp)
(
dim_t i,
dim_t j,
void* restrict b, inc_t rs, inc_t cs,
double* ar,
double* ai
);
// Table of getijm function pointers; bli_getijm() indexes it with the
// object's num_t datatype. GENARRAY is defined elsewhere in BLIS --
// presumably it expands to one entry per floating-point datatype.
static getijm_fp GENARRAY(ftypes_getijm,getijm);
// Read element (i,j) of the object b into the pair (*ar, *ai).
//
// i, j   - zero-based row and column index; must satisfy 0 <= i < m and
//          0 <= j < n, where m x n are the dimensions of b.
// b      - source object.
// ar, ai - output locations written by the type-specific getter. What is
//          stored in *ai for real datatypes is decided by the "gets" scalar
//          macro, which is defined elsewhere.
//
// Returns BLIS_FAILURE, leaving *ar and *ai untouched, if either index is
// out of bounds or if b has datatype BLIS_CONSTANT; otherwise returns
// BLIS_SUCCESS.
err_t bli_getijm
     (
       dim_t   i,
       dim_t   j,
       obj_t*  b,
       double* ar,
       double* ai
     )
{
	dim_t m  = bli_obj_length( b );
	dim_t n  = bli_obj_width( b );
	// Strides are increments rather than dimensions, so hold them as inc_t,
	// matching the rs/cs parameters of getijm_fp.
	inc_t rs = bli_obj_row_stride( b );
	inc_t cs = bli_obj_col_stride( b );
	num_t dt = bli_obj_dt( b );

	// Return error if i or j is beyond bounds of the matrix/vector.
	if ( i < 0 || m <= i ) return BLIS_FAILURE;
	if ( j < 0 || n <= j ) return BLIS_FAILURE;

	// Disallow access into scalar constants.
	if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE;

	// Query the pointer to the buffer at the adjusted offsets.
	void* b_p = bli_obj_buffer_at_off( b );

	// Index into the function pointer array.
	getijm_fp f = ftypes_getijm[ dt ];

	// Invoke the type-specific function.
	f
	(
	  i,
	  j,
	  b_p, rs, cs,
	  ar,
	  ai
	);

	return BLIS_SUCCESS;
}
// Template for the type-specific getijm functions. Each instantiation
// treats b as an array of ctype and reads the element that lies i row
// strides and j column strides past the start of the buffer into *ar / *ai.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai \
     ) \
{ \
	/* Address of element (i,j) within the typed buffer. */ \
	ctype* restrict elem_ij = ( ctype* )b + i*rs + j*cs; \
\
	/* Extract the element into the double-precision outputs. */ \
	PASTEMAC2(ch,z,gets)( *elem_ij, *ar, *ai ); \
}

INSERT_GENTFUNC_BASIC0( getijm )
cython-blis-1.0.0/blis/_src/frame/base/bli_setgetijm.h 0000664 0000000 0000000 00000005154 14634250137 0022632 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based element setter (defined in bli_setgetijm.c): stores the pair
// (ar, ai) into element (i,j) of b. Returns BLIS_FAILURE if i or j is out of
// bounds or if b has datatype BLIS_CONSTANT, and BLIS_SUCCESS otherwise.
BLIS_EXPORT_BLIS err_t bli_setijm
(
double ar,
double ai,
dim_t i,
dim_t j,
obj_t* b
);
// Prototype template for the type-specific setijm functions. Each one takes
// the value pair (ar, ai), the element indices (i, j), and the raw buffer b
// together with its row stride rs and column stride cs.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
double ar, \
double ai, \
dim_t i, \
dim_t j, \
void* restrict b, inc_t rs, inc_t cs \
);
INSERT_GENTPROT_BASIC0( setijm )
// -----------------------------------------------------------------------------
// Object-based element getter (defined in bli_setgetijm.c): reads element
// (i,j) of b into *ar and *ai. Returns BLIS_FAILURE if i or j is out of
// bounds or if b has datatype BLIS_CONSTANT, and BLIS_SUCCESS otherwise.
BLIS_EXPORT_BLIS err_t bli_getijm
(
dim_t i,
dim_t j,
obj_t* b,
double* ar,
double* ai
);
// Prototype template for the type-specific getijm functions. Each one takes
// the element indices (i, j), the raw buffer b together with its row stride
// rs and column stride cs, and the output locations ar and ai.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
dim_t i, \
dim_t j, \
void* restrict b, inc_t rs, inc_t cs, \
double* ar, \
double* ai \
);
INSERT_GENTPROT_BASIC0( getijm )
cython-blis-1.0.0/blis/_src/frame/base/bli_setgetijv.c 0000664 0000000 0000000 00000010434 14634250137 0022633 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Signature of the type-specific setijv functions that the GENTFUNC macro
// further down in this file defines. bli_setijv() calls through a pointer of
// this type, passing the value pair (ar, ai), the element index i, and the
// buffer address with its increment.
typedef void (*setijv_fp)
(
double ar,
double ai,
dim_t i,
void* restrict x, inc_t incx
);
// Table of setijv function pointers; bli_setijv() indexes it with the
// object's num_t datatype. GENARRAY is defined elsewhere in BLIS --
// presumably it expands to one entry per floating-point datatype.
static setijv_fp GENARRAY(ftypes_setijv,setijv);
// Set element i of the vector object x from the value pair (ar, ai).
//
// ar, ai - real and imaginary components handed to the type-specific setter.
//          How ai is treated for real datatypes is decided by the "sets"
//          scalar macro, which is defined elsewhere.
// i      - zero-based element index; must satisfy 0 <= i < n, where n is the
//          vector dimension of x.
// x      - destination object.
//
// Returns BLIS_FAILURE, without writing to x, if i is out of bounds or if x
// has datatype BLIS_CONSTANT; otherwise returns BLIS_SUCCESS.
err_t bli_setijv
     (
       double ar,
       double ai,
       dim_t  i,
       obj_t* x
     )
{
	dim_t n    = bli_obj_vector_dim( x );
	// The vector increment is a stride, so hold it as inc_t, matching the
	// incx parameter of setijv_fp.
	inc_t incx = bli_obj_vector_inc( x );
	num_t dt   = bli_obj_dt( x );

	// Return error if i is beyond bounds of the vector.
	if ( i < 0 || n <= i ) return BLIS_FAILURE;

	// Don't modify scalar constants.
	if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE;

	// Query the pointer to the buffer at the adjusted offsets.
	void* x_p = bli_obj_buffer_at_off( x );

	// Index into the function pointer array.
	setijv_fp f = ftypes_setijv[ dt ];

	// Invoke the type-specific function.
	f
	(
	  ar,
	  ai,
	  i,
	  x_p, incx
	);

	return BLIS_SUCCESS;
}
// Template for the type-specific setijv functions. Each instantiation
// treats x as an array of ctype and stores (ar, ai) into the element that
// lies i increments past the start of the buffer.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx \
     ) \
{ \
	/* Address of element i within the typed buffer. */ \
	ctype* restrict elem_i = ( ctype* )x + i*incx; \
\
	/* Store the double-precision pair into the element. */ \
	PASTEMAC2(z,ch,sets)( ar, ai, *elem_i ); \
}

INSERT_GENTFUNC_BASIC0( setijv )
// -----------------------------------------------------------------------------
// Signature of the type-specific getijv functions that the GENTFUNC macro
// further down in this file defines. bli_getijv() calls through a pointer of
// this type, passing the element index i, the buffer address with its
// increment, and the two output locations ar and ai.
typedef void (*getijv_fp)
(
dim_t i,
void* restrict x, inc_t incx,
double* ar,
double* ai
);
// Table of getijv function pointers; bli_getijv() indexes it with the
// object's num_t datatype. GENARRAY is defined elsewhere in BLIS --
// presumably it expands to one entry per floating-point datatype.
static getijv_fp GENARRAY(ftypes_getijv,getijv);
// Read element i of the vector object x into the pair (*ar, *ai).
//
// i      - zero-based element index; must satisfy 0 <= i < n, where n is the
//          vector dimension of x.
// x      - source object.
// ar, ai - output locations written by the type-specific getter. What is
//          stored in *ai for real datatypes is decided by the "gets" scalar
//          macro, which is defined elsewhere.
//
// Returns BLIS_FAILURE, leaving *ar and *ai untouched, if i is out of bounds
// or if x has datatype BLIS_CONSTANT; otherwise returns BLIS_SUCCESS.
err_t bli_getijv
     (
       dim_t   i,
       obj_t*  x,
       double* ar,
       double* ai
     )
{
	dim_t n    = bli_obj_vector_dim( x );
	// The vector increment is a stride, so hold it as inc_t, matching the
	// incx parameter of getijv_fp.
	inc_t incx = bli_obj_vector_inc( x );
	num_t dt   = bli_obj_dt( x );

	// Return error if i is beyond bounds of the vector.
	if ( i < 0 || n <= i ) return BLIS_FAILURE;

	// Disallow access into scalar constants.
	if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE;

	// Query the pointer to the buffer at the adjusted offsets.
	void* x_p = bli_obj_buffer_at_off( x );

	// Index into the function pointer array.
	getijv_fp f = ftypes_getijv[ dt ];

	// Invoke the type-specific function.
	f
	(
	  i,
	  x_p, incx,
	  ar,
	  ai
	);

	return BLIS_SUCCESS;
}
// Template for the type-specific getijv functions. Each instantiation
// treats x as an array of ctype and reads the element that lies i
// increments past the start of the buffer into *ar / *ai.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict x, inc_t incx, \
       double*        ar, \
       double*        ai \
     ) \
{ \
	/* Address of element i within the typed buffer. */ \
	ctype* restrict elem_i = ( ctype* )x + i*incx; \
\
	/* Extract the element into the double-precision outputs. */ \
	PASTEMAC2(ch,z,gets)( *elem_i, *ar, *ai ); \
}

INSERT_GENTFUNC_BASIC0( getijv )
cython-blis-1.0.0/blis/_src/frame/base/bli_setgetijv.h 0000664 0000000 0000000 00000005001 14634250137 0022632 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Object-based element setter (defined in bli_setgetijv.c): stores the pair
// (ar, ai) into element i of the vector x. Returns BLIS_FAILURE if i is out
// of bounds or if x has datatype BLIS_CONSTANT, and BLIS_SUCCESS otherwise.
BLIS_EXPORT_BLIS err_t bli_setijv
(
double ar,
double ai,
dim_t i,
obj_t* x
);
// Prototype template for the type-specific setijv functions. Each one takes
// the value pair (ar, ai), the element index i, and the raw buffer x
// together with its increment incx.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
( \
double ar, \
double ai, \
dim_t i, \
void* restrict x, inc_t incx \
);
INSERT_GENTPROT_BASIC0( setijv )
// -----------------------------------------------------------------------------
// Object-based element getter (defined in bli_setgetijv.c): reads element i
// of the vector x into *ar and *ai. Returns BLIS_FAILURE if i is out of
// bounds or if x has datatype BLIS_CONSTANT, and BLIS_SUCCESS otherwise.
BLIS_EXPORT_BLIS err_t bli_getijv
(
dim_t i,
obj_t* x,
double* ar,
double* ai
);
// Prototype template for the type-specific getijv functions. Each one takes
// the element index i, the raw vector buffer x together with its increment
// incx, and the output locations ar and ai. The buffer parameter is named x
// to agree with the definitions in bli_setgetijv.c and with the setijv
// prototype.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict x, inc_t incx, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijv )
cython-blis-1.0.0/blis/_src/frame/base/bli_setri.c 0000664 0000000 0000000 00000011336 14634250137 0021757 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// -- setr ---------------------------------------------------------------------
// Set the real part of every element of the matrix object b to alpha.
// If alpha is complex, only its real component is used; the imaginary
// component is dropped by the typecasting copy below.
void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     )
{
	obj_t alpha_r;
	obj_t b_real;

	// Validate the operands when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_setm_check( alpha, b );
	}

	// The local scalar uses the real projection of b's datatype.
	const num_t dt_real = bli_obj_dt_proj_to_real( b );

	bli_obj_scalar_init_detached( dt_real, &alpha_r );

	// Typecast alpha into the real local scalar.
	bli_copysc( alpha, &alpha_r );

	// Alias the real part of b, then fill that alias via setm.
	bli_obj_real_part( b, &b_real );
	bli_setm( &alpha_r, &b_real );
}
// Set the real part of every element of the vector object x to alpha.
// If alpha is complex, only its real component is used; the imaginary
// component is dropped by the typecasting copy below.
void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     )
{
	obj_t alpha_r;
	obj_t x_real;

	// Validate the operands when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_setv_check( alpha, x );
	}

	// The local scalar uses the real projection of x's datatype.
	const num_t dt_real = bli_obj_dt_proj_to_real( x );

	bli_obj_scalar_init_detached( dt_real, &alpha_r );

	// Typecast alpha into the real local scalar.
	bli_copysc( alpha, &alpha_r );

	// Alias the real part of x, then fill that alias via setv.
	bli_obj_real_part( x, &x_real );
	bli_setv( &alpha_r, &x_real );
}
// -- seti ---------------------------------------------------------------------
// Set the imaginary part of every element of the matrix object b to alpha.
// Real-typed objects have no imaginary part, so they are left untouched.
// If alpha is complex, only its real component is used.
void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     )
{
	obj_t alpha_r;
	obj_t b_imag;

	// Validate the operands when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_setm_check( alpha, b );
	}

	// Only non-real objects carry an imaginary part to overwrite.
	if ( !bli_obj_is_real( b ) )
	{
		// The local scalar uses the real projection of b's datatype.
		const num_t dt_real = bli_obj_dt_proj_to_real( b );

		bli_obj_scalar_init_detached( dt_real, &alpha_r );

		// Typecast alpha into the real local scalar.
		bli_copysc( alpha, &alpha_r );

		// Alias the imaginary part of b, then fill that alias via setm.
		bli_obj_imag_part( b, &b_imag );
		bli_setm( &alpha_r, &b_imag );
	}
}
// Set the imaginary part of every element of the vector object x to alpha.
// Real-typed objects have no imaginary part, so they are left untouched.
// If alpha is complex, only its real component is used; the imaginary
// component is dropped by the typecasting copy below.
void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     )
{
	obj_t alpha_real;
	obj_t xi;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_setv_check( alpha, x );

	// If the object is real, return early.
	if ( bli_obj_is_real( x ) ) return;

	// Initialize a local scalar, alpha_real, using the real projection
	// of the datatype of x.
	bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( x ),
	                              &alpha_real );

	// Copy/typecast alpha to alpha_real. This discards the imaginary
	// part of alpha (if it is complex).
	bli_copysc( alpha, &alpha_real );

	// Acquire an alias to the imaginary part of x.
	bli_obj_imag_part( x, &xi );

	// Use setv to set the imaginary part of x to alpha_real. The operands
	// were validated with bli_setv_check(), so use the vector operation,
	// as bli_setrv() does for the real part.
	bli_setv( &alpha_real, &xi );
}
cython-blis-1.0.0/blis/_src/frame/base/bli_setri.h 0000664 0000000 0000000 00000004201 14634250137 0021755 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// -- setr ---------------------------------------------------------------------
// Set the real part of every element of the matrix b to alpha (defined in
// bli_setri.c). Only the real component of alpha is used.
BLIS_EXPORT_BLIS void bli_setrm
(
obj_t* alpha,
obj_t* b
);
// Vector counterpart of bli_setrm(): set the real part of every element of
// the vector x to alpha.
BLIS_EXPORT_BLIS void bli_setrv
(
obj_t* alpha,
obj_t* x
);
// -- seti ---------------------------------------------------------------------
// Set the imaginary part of every element of the matrix b to alpha (defined
// in bli_setri.c). Returns without modifying b when b is real. Only the real
// component of alpha is used.
BLIS_EXPORT_BLIS void bli_setim
(
obj_t* alpha,
obj_t* b
);
// Vector counterpart of bli_setim(): set the imaginary part of every element
// of the vector x to alpha; returns without modifying x when x is real.
BLIS_EXPORT_BLIS void bli_setiv
(
obj_t* alpha,
obj_t* x
);
cython-blis-1.0.0/blis/_src/frame/base/bli_string.c 0000664 0000000 0000000 00000003562 14634250137 0022141 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Convert the NUL-terminated string s to uppercase in place.
void bli_string_mkupper( char* s )
{
	char* cur = s;

	// Walk the string until the terminating NUL is reached.
	while ( *cur != '\0' )
	{
		// toupper() needs a value representable as unsigned char, so
		// convert first in case plain char is signed and negative.
		*cur = toupper( ( unsigned char ) *cur );

		++cur;
	}
}
cython-blis-1.0.0/blis/_src/frame/base/bli_string.h 0000664 0000000 0000000 00000003255 14634250137 0022145 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Convert the NUL-terminated string s to uppercase in place (defined in
// bli_string.c).
void bli_string_mkupper( char* s );
cython-blis-1.0.0/blis/_src/frame/base/bli_winsys.c 0000664 0000000 0000000 00000004511 14634250137 0022162 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef _MSC_VER
#include <windows.h>
#else
#include <unistd.h>
#endif
#if 0
// NOTE: This function is no longer needed by BLIS since BLIS no longer
// makes any attempt to change environment variables; rather, it only
// reads them. We can keep it here for some time before removing it,
// though.
// NOTE(review): the function is declared to return int but has no return
// statement on either branch, and the Windows branch ignores the overwrite
// argument. Both would need attention if this code were ever re-enabled.
int bli_setenv( const char *name, const char *value, int overwrite )
{
#ifdef _MSC_VER
// Windows.
_putenv_s( name, value );
#else
// Everything else: Linux, OS X, etc.
setenv( name, value, overwrite );
#endif
}
#endif
// Suspend the calling thread for secs seconds using the platform's native
// sleep routine.
void bli_sleep( unsigned int secs )
{
#ifndef _MSC_VER
	// Everything except MSVC (Linux, OS X, etc.): sleep() takes seconds.
	sleep( secs );
#else
	// MSVC / Windows: Sleep() takes milliseconds, so scale the argument.
	Sleep( secs * 1000 );
#endif
}
cython-blis-1.0.0/blis/_src/frame/base/bli_winsys.h 0000664 0000000 0000000 00000003411 14634250137 0022165 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//int bli_setenv( const char *name, const char *value, int overwrite );
// Sleep for secs seconds (defined in bli_winsys.c, which selects between
// the Windows and non-Windows sleep routines).
BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs );
cython-blis-1.0.0/blis/_src/frame/base/cast/ 0000775 0000000 0000000 00000000000 14634250137 0020565 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castm.c 0000664 0000000 0000000 00000015307 14634250137 0022674 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// NOTE: This is one of the few functions in BLIS that is defined
// with heterogeneous type support. This is done so that we have
// an operation that can be used to typecast (copy-cast) a matrix
// of one datatype to a matrix of another datatype.
// Signature of the typed castm functions that the GENTFUNC2 macro further
// down in this file defines: a trans_t for the source operand, the m x n
// dimensions, and the source (a) and destination (b) buffers with their row
// and column strides.
typedef void (*FUNCPTR_T)
(
trans_t transa,
dim_t m,
dim_t n,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b
);
// Two-dimensional table of castm function pointers; bli_castm() indexes it
// as ftypes[dt_a][dt_b]. GENARRAY2_ALL is defined elsewhere in BLIS.
static FUNCPTR_T GENARRAY2_ALL(ftypes,castm);
//
// Define object-based interface.
//
// Object-based castm: copy matrix a into matrix b through the typed
// function selected by the two objects' datatypes. The dimensions come from
// b; the conjugation/transposition status comes from a.
void bli_castm
     (
       obj_t* a,
       obj_t* b
     )
{
	// Datatypes of the source and destination select the typed function.
	const num_t   src_dt = bli_obj_dt( a );
	const num_t   dst_dt = bli_obj_dt( b );

	// Conjugation and/or transposition to apply to a.
	const trans_t trans  = bli_obj_conjtrans_status( a );

	// The destination determines the m x n dimensions.
	const dim_t   rows   = bli_obj_length( b );
	const dim_t   cols   = bli_obj_width( b );

	// Source buffer and strides.
	void*         src    = bli_obj_buffer_at_off( a );
	const inc_t   src_rs = bli_obj_row_stride( a );
	const inc_t   src_cs = bli_obj_col_stride( a );

	// Destination buffer and strides.
	void*         dst    = bli_obj_buffer_at_off( b );
	const inc_t   dst_rs = bli_obj_row_stride( b );
	const inc_t   dst_cs = bli_obj_col_stride( b );

	// Validate the operands when error checking is switched on.
	if ( bli_error_checking_is_enabled() )
	{
		bli_castm_check( a, b );
	}

#if 0
	if ( bli_obj_dt( a ) == bli_obj_dt( b ) )
	{
		// If a and b share the same datatype, we can simply use copym.
		bli_copym( a, b );
		return;
	}
#endif

	// Look up the function for this (source, destination) type pair and
	// call it with the raw buffers.
	FUNCPTR_T cast_fp = ftypes[src_dt][dst_dt];

	cast_fp
	(
	  trans,
	  rows,
	  cols,
	  src, src_rs, src_cs,
	  dst, dst_rs, dst_cs
	);
}
// -----------------------------------------------------------------------------
//
// Define BLAS-like interfaces with typed operands.
//
// Template for the typed castm functions, one per (ctype_a, ctype_b) pair.
// Each instantiation copies the m x n matrix a into b element by element
// using the mixed-type copys/copyjs scalar macros. bli_set_dims_incs_2m()
// supplies the loop bounds (n_iter, n_elem) and the per-operand increments
// (inca, lda, incb, ldb) from transa, the dimensions, and the strides.
// The copyjs variant is used when transa carries a conjugation -- presumably
// it conjugates while copying; the scalar macros are defined elsewhere.
#undef GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \
\
void PASTEMAC2(cha,chb,opname) \
( \
trans_t transa, \
dim_t m, \
dim_t n, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype_a* restrict a_cast = a; \
ctype_b* restrict b_cast = b; \
conj_t conja; \
dim_t n_iter; \
dim_t n_elem; \
inc_t lda, inca; \
inc_t ldb, incb; \
dim_t j, i; \
\
/* Set various loop parameters. */ \
bli_set_dims_incs_2m \
( \
transa, \
m, n, rs_a, cs_a, rs_b, cs_b, \
&n_elem, &n_iter, &inca, &lda, &incb, &ldb \
); \
\
/* Extract the conjugation component from the transa parameter. */ \
conja = bli_extract_conj( transa ); \
\
/* Conjugation requested: copy with copyjs. */ \
if ( bli_is_conj( conja ) ) \
{ \
/* Both operands have unit increment: index a1 and b1 directly. */ \
if ( inca == 1 && incb == 1 ) \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copyjs)( a1[i], b1[i] ); \
} \
} \
} \
/* General increments: advance both pointers by inca / incb. */ \
else \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copyjs)( *a1, *b1 ); \
\
a1 += inca; \
b1 += incb; \
} \
} \
} \
} \
/* No conjugation: copy with copys. */ \
else \
{ \
/* Both operands have unit increment: index a1 and b1 directly. */ \
if ( inca == 1 && incb == 1 ) \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copys)( a1[i], b1[i] ); \
} \
} \
} \
/* General increments: advance both pointers by inca / incb. */ \
else \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copys)( *a1, *b1 ); \
\
a1 += inca; \
b1 += incb; \
} \
} \
} \
} \
}
// Instantiate the template. NOTE(review): judging by the macro names, BASIC0
// covers the same-precision type pairs and MIXDP0 the mixed-precision pairs;
// confirm against the macro definitions.
INSERT_GENTFUNC2_BASIC0( castm )
INSERT_GENTFUNC2_MIXDP0( castm )
// -----------------------------------------------------------------------------
//
// Define object-based _check() function.
//
// Validate the operands of bli_castm(). Each check yields an error code that
// is handed to bli_check_error_code(). The checks are, in order: both
// objects have a floating-point datatype, both have general structure, both
// are matrix objects with conformal dimensions, and both have non-NULL
// buffers.
void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     )
{
	err_t e_val;

	// Check object datatypes.

	e_val = bli_check_floating_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( b );
	bli_check_error_code( e_val );

	// Check structure.
	// NOTE: We enforce general structure for now in order to simplify the
	// implementation.

	// The result must be captured in e_val; otherwise the call to
	// bli_check_error_code() would re-examine the previous (already
	// successful) code and the structure check would have no effect.
	e_val = bli_check_general_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_general_object( b );
	bli_check_error_code( e_val );

	// Check object dimensions.

	e_val = bli_check_matrix_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( b );
	bli_check_error_code( e_val );

	e_val = bli_check_conformal_dims( a, b );
	bli_check_error_code( e_val );

	// Check object buffers (for non-NULLness).

	e_val = bli_check_object_buffer( a );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( b );
	bli_check_error_code( e_val );
}
cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castm.h 0000664 0000000 0000000 00000004472 14634250137 0022702 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// Object-based entry point for the matrix cast operation. Takes two
// objects; a is presumably the source and b the destination (confirm
// against the definition in bli_castm.c).
BLIS_EXPORT_BLIS void bli_castm
(
obj_t* a,
obj_t* b
);
//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTPROT2 is (re)defined here as a prototype template; each expansion
// declares one exported function whose name is formed by
// PASTEMAC2(cha,chb,opname). The operands are passed as untyped void*
// buffers together with their row and column strides.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
( \
trans_t transa, \
dim_t m, \
dim_t n, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
// Expand the template for castm. The INSERT_GENTPROT2_* macros are defined
// elsewhere; presumably BASIC0 covers same-precision type pairs and MIXDP0
// the mixed-precision pairs -- confirm against their definitions.
INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )
//
// Prototype object-based _check() function.
//
// Parameter-validation routine for bli_castm(); takes the same two objects.
void bli_castm_check
(
obj_t* a,
obj_t* b
);
cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castnzm.c 0000664 0000000 0000000 00000015333 14634250137 0023243 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// NOTE: This is one of the few functions in BLIS that is defined
// with heterogeneous type support. This is done so that we have
// an operation that can be used to typecast (copy-cast) a matrix
// of one datatype to a matrix of another datatype.
// Function-pointer type matching the signature of the typed castnzm
// functions generated by GENTFUNC2 later in this file: a trans_t for a,
// the m x n dimensions, and an untyped buffer plus row/column strides for
// each of a and b.
typedef void (*FUNCPTR_T)
(
trans_t transa,
dim_t m,
dim_t n,
void* restrict a, inc_t rs_a, inc_t cs_a,
void* restrict b, inc_t rs_b, inc_t cs_b
);
// File-local table of castnzm function pointers, built by GENARRAY2_ALL
// (defined elsewhere). bli_castnzm() indexes it as ftypes[dt_a][dt_b].
static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm);
//
// Define object-based interface.
//
// Object-based castnzm: unpack the two objects into raw buffers, strides
// and dimensions, then dispatch to the typed function selected by the
// datatype pair (dt of a, dt of b).
//
//   a  operand supplying the conjugation/transposition status and the
//      first buffer.
//   b  operand supplying the m x n dimensions and the second buffer.
void bli_castnzm
     (
       obj_t* a,
       obj_t* b
     )
{
    // Datatypes of both operands; together they select the typed function.
    const num_t   src_dt = bli_obj_dt( a );
    const num_t   dst_dt = bli_obj_dt( b );

    // Conjugation/transposition is read from a; the dimensions from b.
    const trans_t trans  = bli_obj_conjtrans_status( a );
    const dim_t   rows   = bli_obj_length( b );
    const dim_t   cols   = bli_obj_width( b );

    // Buffer (at the object's offset) and strides of a.
    void* const   src    = bli_obj_buffer_at_off( a );
    const inc_t   src_rs = bli_obj_row_stride( a );
    const inc_t   src_cs = bli_obj_col_stride( a );

    // Buffer (at the object's offset) and strides of b.
    void* const   dst    = bli_obj_buffer_at_off( b );
    const inc_t   dst_rs = bli_obj_row_stride( b );
    const inc_t   dst_cs = bli_obj_col_stride( b );

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_castnzm_check( a, b );
    }

#if 0
    if ( bli_obj_dt( a ) == bli_obj_dt( b ) )
    {
        // If a and b share the same datatype, we can simply use copym.
        bli_copym( a, b );
        return;
    }
#endif

    // Look up the typed function for this datatype pair and call it
    // through the void pointer-based interface.
    FUNCPTR_T kernel = ftypes[src_dt][dst_dt];

    kernel( trans,
            rows, cols,
            src, src_rs, src_cs,
            dst, dst_rs, dst_cs );
}
// -----------------------------------------------------------------------------
//
// Define BLAS-like interfaces with typed operands.
//
// GENTFUNC2 is (re)defined here as the template for the typed castnzm
// functions. Each expansion defines a function named by
// PASTEMAC2(cha,chb,opname) that:
//   1. views the void* buffers a and b as ctype_a* and ctype_b*;
//   2. calls bli_set_dims_incs_2m() to obtain the inner/outer loop counts
//      (n_elem, n_iter) and, for each operand, the element increment
//      (inca/incb) and the increment between outer iterations (lda/ldb);
//   3. extracts the conjugation component of transa;
//   4. visits n_iter x n_elem elements, applying the chx/chy copyjnzs
//      scalar macro when conjugation is requested and copynzs otherwise.
// Each of the two conjugation branches has a separate loop nest for the
// case inca == 1 && incb == 1, which indexes a1[i]/b1[i] directly, and a
// general case that advances both pointers by inca/incb after every
// element.
// NOTE(review): the copy(j)nzs scalar macros are defined elsewhere; the
// "nz" presumably distinguishes them from the copy(j)s macros used by
// castm -- confirm their exact semantics there.
#undef GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \
\
void PASTEMAC2(cha,chb,opname) \
( \
trans_t transa, \
dim_t m, \
dim_t n, \
void* restrict a, inc_t rs_a, inc_t cs_a, \
void* restrict b, inc_t rs_b, inc_t cs_b \
) \
{ \
ctype_a* restrict a_cast = a; \
ctype_b* restrict b_cast = b; \
conj_t conja; \
dim_t n_iter; \
dim_t n_elem; \
inc_t lda, inca; \
inc_t ldb, incb; \
dim_t j, i; \
\
/* Set various loop parameters. */ \
bli_set_dims_incs_2m \
( \
transa, \
m, n, rs_a, cs_a, rs_b, cs_b, \
&n_elem, &n_iter, &inca, &lda, &incb, &ldb \
); \
\
/* Extract the conjugation component from the transa parameter. */ \
conja = bli_extract_conj( transa ); \
\
if ( bli_is_conj( conja ) ) \
{ \
if ( inca == 1 && incb == 1 ) \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copyjnzs)( a1[i], b1[i] ); \
} \
} \
} \
else \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copyjnzs)( *a1, *b1 ); \
\
a1 += inca; \
b1 += incb; \
} \
} \
} \
} \
else \
{ \
if ( inca == 1 && incb == 1 ) \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copynzs)( a1[i], b1[i] ); \
} \
} \
} \
else \
{ \
for ( j = 0; j < n_iter; ++j ) \
{ \
ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \
ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \
\
for ( i = 0; i < n_elem; ++i ) \
{ \
PASTEMAC2(cha,chb,copynzs)( *a1, *b1 ); \
\
a1 += inca; \
b1 += incb; \
} \
} \
} \
} \
}
// Instantiate the template for castnzm. The INSERT_GENTFUNC2_* macros are
// defined elsewhere; presumably BASIC0 covers same-precision type pairs and
// MIXDP0 the mixed-precision pairs -- confirm against their definitions.
INSERT_GENTFUNC2_BASIC0( castnzm )
INSERT_GENTFUNC2_MIXDP0( castnzm )
// -----------------------------------------------------------------------------
//
// Define object-based _check() function.
//
// Validate the operands of bli_castnzm().
//
// Each bli_check_*() helper returns an err_t that is handed to
// bli_check_error_code(). The checks run in this order: floating-point
// datatypes, general structure, matrix dimensions (including conformality
// between a and b), and finally non-NULL buffers.
//
//   a  first operand of the cast.
//   b  second operand of the cast.
void bli_castnzm_check
     (
       obj_t* a,
       obj_t* b
     )
{
    err_t e_val;

    // Check object datatypes.

    e_val = bli_check_floating_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_floating_object( b );
    bli_check_error_code( e_val );

    // Check structure.
    // NOTE: We enforce general structure for now in order to simplify the
    // implementation.
    // The result of each structure check must be stored in e_val; otherwise
    // the bli_check_error_code() call that follows merely re-tests the
    // outcome of the previous (already passed) check and the structure
    // requirement is never enforced.

    e_val = bli_check_general_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_general_object( b );
    bli_check_error_code( e_val );

    // Check object dimensions.

    e_val = bli_check_matrix_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_matrix_object( b );
    bli_check_error_code( e_val );

    e_val = bli_check_conformal_dims( a, b );
    bli_check_error_code( e_val );

    // Check object buffers (for non-NULLness).

    e_val = bli_check_object_buffer( a );
    bli_check_error_code( e_val );

    e_val = bli_check_object_buffer( b );
    bli_check_error_code( e_val );
}
cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castnzm.h 0000664 0000000 0000000 00000004502 14634250137 0023244 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// Object-based entry point for the castnzm operation. Takes two objects;
// a is presumably the source and b the destination (confirm against the
// definition in bli_castnzm.c).
BLIS_EXPORT_BLIS void bli_castnzm
(
obj_t* a,
obj_t* b
);
//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTPROT2 is (re)defined here as a prototype template; each expansion
// declares one exported function whose name is formed by
// PASTEMAC2(cha,chb,opname). The operands are passed as untyped void*
// buffers together with their row and column strides.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
( \
trans_t transa, \
dim_t m, \
dim_t n, \
void* a, inc_t rs_a, inc_t cs_a, \
void* b, inc_t rs_b, inc_t cs_b \
);
// Expand the template for castnzm. The INSERT_GENTPROT2_* macros are
// defined elsewhere; presumably BASIC0 covers same-precision type pairs and
// MIXDP0 the mixed-precision pairs -- confirm against their definitions.
INSERT_GENTPROT2_BASIC0( castnzm )
INSERT_GENTPROT2_MIXDP0( castnzm )
//
// Prototype object-based _check() function.
//
// Parameter-validation routine for bli_castnzm(); takes the same two
// objects.
void bli_castnzm_check
(
obj_t* a,
obj_t* b
);
cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castv.c 0000664 0000000 0000000 00000012051 14634250137 0022676 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// NOTE: This is one of the few functions in BLIS that is defined
// with heterogeneous type support. This is done so that we have
// an operation that can be used to typecast (copy-cast) a vector
// of one datatype to a vector of another datatype.
// Function-pointer type matching the signature of the typed castv
// functions generated by GENTFUNC2 later in this file: a conj_t for x, the
// length n, and an untyped buffer plus increment for each of x and y.
typedef void (*FUNCPTR_T)
(
conj_t conjx,
dim_t n,
void* restrict x, inc_t inc_x,
void* restrict y, inc_t inc_y
);
// File-local table of castv function pointers, built by GENARRAY2_ALL
// (defined elsewhere). bli_castv() indexes it as ftypes[dt_x][dt_y].
static FUNCPTR_T GENARRAY2_ALL(ftypes,castv);
//
// Define object-based interface.
//
// Object-based castv: unpack the two vector objects into raw buffers,
// increments and a length, then dispatch to the typed function selected by
// the datatype pair (dt of x, dt of y).
//
//   x  operand supplying the conjugation status, the vector length and the
//      first buffer.
//   y  operand supplying the second buffer.
void bli_castv
     (
       obj_t* x,
       obj_t* y
     )
{
    // Datatypes of both operands; together they select the typed function.
    const num_t  src_dt  = bli_obj_dt( x );
    const num_t  dst_dt  = bli_obj_dt( y );

    // Conjugation status and vector length are both read from x.
    const conj_t conj    = bli_obj_conj_status( x );
    const dim_t  len     = bli_obj_vector_dim( x );

    // Buffer (at the object's offset) and increment of each operand.
    void* const  src     = bli_obj_buffer_at_off( x );
    const inc_t  src_inc = bli_obj_vector_inc( x );
    void* const  dst     = bli_obj_buffer_at_off( y );
    const inc_t  dst_inc = bli_obj_vector_inc( y );

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
    {
        bli_castv_check( x, y );
    }

#if 0
    if ( bli_obj_dt( x ) == bli_obj_dt( y ) )
    {
        // If x and y share the same datatype, we can simply use copyv.
        bli_copyv( x, y );
        return;
    }
#endif

    // Look up the typed function for this datatype pair and call it
    // through the void pointer-based interface.
    FUNCPTR_T kernel = ftypes[src_dt][dst_dt];

    kernel( conj,
            len,
            src, src_inc,
            dst, dst_inc );
}
// -----------------------------------------------------------------------------
//
// Define BLAS-like interfaces with typed operands.
//
// GENTFUNC2 is (re)defined here as the template for the typed castv
// functions. Each expansion defines a function named by
// PASTEMAC2(chx,chy,opname) that views the void* buffers x and y as
// ctype_x* and ctype_y* and visits n elements, applying the chx/chy copyjs
// scalar macro when conjx requests conjugation and copys otherwise.
// Each of the two conjugation branches has a separate loop for the case
// incx == 1 && incy == 1, which indexes x1[i]/y1[i] directly, and a general
// case that advances both pointers by incx/incy after every element.
#undef GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC2(chx,chy,opname) \
( \
conj_t conjx, \
dim_t n, \
void* restrict x, inc_t incx, \
void* restrict y, inc_t incy \
) \
{ \
ctype_x* restrict x1 = x; \
ctype_y* restrict y1 = y; \
dim_t i; \
\
if ( bli_is_conj( conjx ) ) \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
{ \
PASTEMAC2(chx,chy,copyjs)( x1[i], y1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
{ \
PASTEMAC2(chx,chy,copyjs)( *x1, *y1 ); \
\
x1 += incx; \
y1 += incy; \
} \
} \
} \
else \
{ \
if ( incx == 1 && incy == 1 ) \
{ \
for ( i = 0; i < n; ++i ) \
{ \
PASTEMAC2(chx,chy,copys)( x1[i], y1[i] ); \
} \
} \
else \
{ \
for ( i = 0; i < n; ++i ) \
{ \
PASTEMAC2(chx,chy,copys)( *x1, *y1 ); \
\
x1 += incx; \
y1 += incy; \
} \
} \
} \
}
// Instantiate the template for castv. The INSERT_GENTFUNC2_* macros are
// defined elsewhere; presumably BASIC0 covers same-precision type pairs and
// MIXDP0 the mixed-precision pairs -- confirm against their definitions.
INSERT_GENTFUNC2_BASIC0( castv )
INSERT_GENTFUNC2_MIXDP0( castv )
// -----------------------------------------------------------------------------
//
// Define object-based _check() function.
//
// Validate the operands of bli_castv().
//
// The result of every bli_check_*() helper is passed straight to
// bli_check_error_code(). Checks run in this order: floating-point
// datatypes, vector shape (including equal lengths), non-NULL buffers.
//
//   x  first vector operand.
//   y  second vector operand.
void bli_castv_check
     (
       obj_t* x,
       obj_t* y
     )
{
    // Datatypes: both operands must pass the floating-object check.
    bli_check_error_code( bli_check_floating_object( x ) );
    bli_check_error_code( bli_check_floating_object( y ) );

    // Dimensions: both must be vectors, and of equal length.
    bli_check_error_code( bli_check_vector_object( x ) );
    bli_check_error_code( bli_check_vector_object( y ) );
    bli_check_error_code( bli_check_equal_vector_lengths( x, y ) );

    // Buffers: neither object may carry a NULL buffer.
    bli_check_error_code( bli_check_object_buffer( x ) );
    bli_check_error_code( bli_check_object_buffer( y ) );
}
cython-blis-1.0.0/blis/_src/frame/base/cast/bli_castv.h 0000664 0000000 0000000 00000004415 14634250137 0022710 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype object-based interface.
//
// Object-based entry point for the vector cast operation. Takes two
// objects; x is presumably the source and y the destination (confirm
// against the definition in bli_castv.c).
BLIS_EXPORT_BLIS void bli_castv
(
obj_t* x,
obj_t* y
);
//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// GENTPROT2 is (re)defined here as a prototype template; each expansion
// declares one exported function whose name is formed by
// PASTEMAC2(chx,chy,opname). The operands are passed as untyped void*
// buffers together with their increments.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \
( \
conj_t conjx, \
dim_t n, \
void* x, inc_t incx, \
void* y, inc_t incy \
);
// Expand the template for castv. The INSERT_GENTPROT2_* macros are defined
// elsewhere; presumably BASIC0 covers same-precision type pairs and MIXDP0
// the mixed-precision pairs -- confirm against their definitions.
INSERT_GENTPROT2_BASIC0( castv )
INSERT_GENTPROT2_MIXDP0( castv )
//
// Prototype object-based _check() function.
//
// Parameter-validation routine for bli_castv(); takes the same two objects.
void bli_castv_check
(
obj_t* x,
obj_t* y
);
cython-blis-1.0.0/blis/_src/frame/base/cast/old/ 0000775 0000000 0000000 00000000000 14634250137 0021343 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/cast/old/bli_cast_check.c 0000664 0000000 0000000 00000006426 14634250137 0024434 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate the operands of the matrix cast operation.
//
// Each bli_check_*() helper returns an err_t that is handed to
// bli_check_error_code(). The checks run in this order: floating-point
// datatypes, general structure, matrix dimensions (including conformality
// between a and b), and finally non-NULL buffers.
//
//   a  first operand of the cast.
//   b  second operand of the cast.
void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     )
{
    err_t e_val;

    // Check object datatypes.

    e_val = bli_check_floating_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_floating_object( b );
    bli_check_error_code( e_val );

    // Check structure.
    // NOTE: We enforce general structure for now in order to simplify the
    // implementation.
    // The result of each structure check must be stored in e_val; otherwise
    // the bli_check_error_code() call that follows merely re-tests the
    // outcome of the previous (already passed) check and the structure
    // requirement is never enforced.

    e_val = bli_check_general_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_general_object( b );
    bli_check_error_code( e_val );

    // Check object dimensions.

    e_val = bli_check_matrix_object( a );
    bli_check_error_code( e_val );

    e_val = bli_check_matrix_object( b );
    bli_check_error_code( e_val );

    e_val = bli_check_conformal_dims( a, b );
    bli_check_error_code( e_val );

    // Check object buffers (for non-NULLness).

    e_val = bli_check_object_buffer( a );
    bli_check_error_code( e_val );

    e_val = bli_check_object_buffer( b );
    bli_check_error_code( e_val );
}
// Validate the operands of the vector cast operation.
//
// The result of every bli_check_*() helper is passed straight to
// bli_check_error_code(). Checks run in this order: floating-point
// datatypes, vector shape (including equal lengths), non-NULL buffers.
//
//   x  first vector operand.
//   y  second vector operand.
void bli_castv_check
     (
       obj_t* x,
       obj_t* y
     )
{
    // Datatypes: both operands must pass the floating-object check.
    bli_check_error_code( bli_check_floating_object( x ) );
    bli_check_error_code( bli_check_floating_object( y ) );

    // Dimensions: both must be vectors, and of equal length.
    bli_check_error_code( bli_check_vector_object( x ) );
    bli_check_error_code( bli_check_vector_object( y ) );
    bli_check_error_code( bli_check_equal_vector_lengths( x, y ) );

    // Buffers: neither object may carry a NULL buffer.
    bli_check_error_code( bli_check_object_buffer( x ) );
    bli_check_error_code( bli_check_object_buffer( y ) );
}
cython-blis-1.0.0/blis/_src/frame/base/cast/old/bli_cast_check.h 0000664 0000000 0000000 00000003424 14634250137 0024434 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter-validation routine for the matrix cast operation; takes the
// two matrix objects involved in the cast.
void bli_castm_check
(
obj_t* a,
obj_t* b
);
// Parameter-validation routine for the vector cast operation; takes the
// two vector objects involved in the cast.
void bli_castv_check
(
obj_t* x,
obj_t* y
);
cython-blis-1.0.0/blis/_src/frame/base/check/ 0000775 0000000 0000000 00000000000 14634250137 0020710 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/check/bli_obj_check.c 0000664 0000000 0000000 00000013144 14634250137 0023614 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate the arguments of an object-creation request.
//
//   dt      requested datatype.
//   m, n    requested matrix dimensions.
//   rs, cs  requested row and column strides.
//   obj     object to be initialized; must not be NULL.
void bli_obj_create_check( num_t  dt,
                           dim_t  m,
                           dim_t  n,
                           inc_t  rs,
                           inc_t  cs,
                           obj_t* obj )
{
    // Validate the requested datatype.
    bli_check_error_code( bli_check_valid_datatype( dt ) );

    // Validate the strides against the dimensions. The fifth argument
    // (which the buffer checks in this file pass as 'is') is fixed at 1.
    bli_check_error_code( bli_check_matrix_strides( m, n, rs, cs, 1 ) );

    // The object pointer itself must be non-NULL.
    bli_check_error_code( bli_check_null_pointer( obj ) );
}
// Validate the arguments of a buffer-less object-creation request.
//
//   dt    requested datatype.
//   m, n  requested dimensions (accepted but not inspected here).
//   obj   object to be initialized; must not be NULL.
void bli_obj_create_without_buffer_check( num_t  dt,
                                          dim_t  m,
                                          dim_t  n,
                                          obj_t* obj )
{
    // Validate the requested datatype.
    bli_check_error_code( bli_check_valid_datatype( dt ) );

    // The object pointer itself must be non-NULL.
    bli_check_error_code( bli_check_null_pointer( obj ) );
}
// Validate the arguments of a buffer-allocation request for an existing
// object.
//
//   rs, cs, is  requested strides, validated against the object's own
//               length and width.
//   obj         object that will receive the buffer; must not be NULL.
void bli_obj_alloc_buffer_check( inc_t  rs,
                                 inc_t  cs,
                                 inc_t  is,
                                 obj_t* obj )
{
    err_t e_val;

    // Test the object pointer first: the stride check below reads the
    // object's length and width, so a NULL obj must be reported through
    // bli_check_error_code() before anything dereferences it.
    e_val = bli_check_null_pointer( obj );
    bli_check_error_code( e_val );

    // Validate the strides against the object's dimensions.
    e_val = bli_check_matrix_strides( bli_obj_length( obj ),
                                      bli_obj_width( obj ),
                                      rs, cs, is );
    bli_check_error_code( e_val );
}
// Validate the arguments of a request to attach a caller-provided buffer
// to an existing object.
//
//   p           buffer to attach; NULL is deliberately permitted (see the
//               note in the body).
//   rs, cs, is  requested strides, validated against the object's own
//               length and width.
//   obj         object that will receive the buffer; must not be NULL.
void bli_obj_attach_buffer_check( void*  p,
                                  inc_t  rs,
                                  inc_t  cs,
                                  inc_t  is,
                                  obj_t* obj )
{
    err_t e_val;

    // NOTE: We allow the caller to attach NULL to an object because
    // the buffer contains NULL after _create_without_buffer() anyway.
    // Thus, we're not opening a window for undefined behavior because
    // that window is already open. Instead of checking for NULL here,
    // we check the object buffers for all objects in all of the
    // computational operations' _check()/_int_check() functions.
    //e_val = bli_check_null_pointer( p );
    //bli_check_error_code( e_val );

    // Test the object pointer first: the stride check below reads the
    // object's length and width, so a NULL obj must be reported through
    // bli_check_error_code() before anything dereferences it.
    e_val = bli_check_null_pointer( obj );
    bli_check_error_code( e_val );

    // Validate the strides against the object's dimensions.
    e_val = bli_check_matrix_strides( bli_obj_length( obj ),
                                      bli_obj_width( obj ),
                                      rs, cs, is );
    bli_check_error_code( e_val );
}
// Validate the arguments of a scalar-object creation request.
//
//   dt   requested datatype.
//   obj  object to be initialized; must not be NULL.
void bli_obj_create_scalar_check( num_t  dt,
                                  obj_t* obj )
{
    // Validate the requested datatype.
    bli_check_error_code( bli_check_valid_datatype( dt ) );

    // The object pointer itself must be non-NULL.
    bli_check_error_code( bli_check_null_pointer( obj ) );
}
// Validation hook for bli_obj_free(). It performs no checks at all.
//
//   obj  object about to be freed (not inspected).
void bli_obj_free_check( obj_t* obj )
{
    // No NULL test is made here because bli_obj_free() handles null
    // pointers safely. The check that would otherwise apply is kept
    // below for reference only:
    //
    //   err_t e_val;
    //   e_val = bli_check_null_pointer( obj );
    //   bli_check_error_code( e_val );
}
// Validate the arguments of a constant-object creation request.
//
//   value  constant to be stored (accepted but not inspected here).
//   obj    object to be initialized; must not be NULL.
void bli_obj_create_const_check( double value, obj_t* obj )
{
    // The object pointer itself must be non-NULL.
    bli_check_error_code( bli_check_null_pointer( obj ) );
}
// NOTE: The function below is excluded from compilation by the enclosing
// #if 0 / #endif pair. As written, it would require both object pointers
// to be non-NULL and would require a to be a scalar object with a non-NULL
// buffer.
#if 0
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b )
{
err_t e_val;
e_val = bli_check_null_pointer( a );
bli_check_error_code( e_val );
e_val = bli_check_null_pointer( b );
bli_check_error_code( e_val );
e_val = bli_check_scalar_object( a );
bli_check_error_code( e_val );
e_val = bli_check_object_buffer( a );
bli_check_error_code( e_val );
}
#endif
// Validate the argument of a datatype-size query.
//
//   dt  datatype being queried; must pass bli_check_valid_datatype().
void bli_dt_size_check( num_t dt )
{
    bli_check_error_code( bli_check_valid_datatype( dt ) );
}
// Validate the argument of a datatype-string query.
//
//   dt  datatype being queried; must pass
//       bli_check_nonconstant_datatype().
void bli_dt_string_check( num_t dt )
{
    bli_check_error_code( bli_check_nonconstant_datatype( dt ) );
}
// Validate the arguments of a datatype-union query.
//
//   dt1, dt2  datatypes being combined; each must pass
//             bli_check_floating_datatype().
void bli_dt_union_check( num_t dt1, num_t dt2 )
{
    // The first datatype is checked (and any error reported) before the
    // second one is examined.
    bli_check_error_code( bli_check_floating_datatype( dt1 ) );
    bli_check_error_code( bli_check_floating_datatype( dt2 ) );
}
// Validate the arguments of an object-print request.
//
//   label  text label accompanying the object; must not be NULL.
//   obj    object to be printed; must not be NULL.
void bli_obj_print_check( char* label, obj_t* obj )
{
    // The label is checked (and any error reported) before the object.
    bli_check_error_code( bli_check_null_pointer( label ) );
    bli_check_error_code( bli_check_null_pointer( obj ) );
}
cython-blis-1.0.0/blis/_src/frame/base/check/bli_obj_check.h 0000664 0000000 0000000 00000005625 14634250137 0023626 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Prototypes for the parameter-validation routines defined in
// bli_obj_check.c.

// Checks the datatype, the strides (against m and n) and the object
// pointer of an object-creation request.
void bli_obj_create_check( num_t dt,
dim_t m,
dim_t n,
inc_t rs,
inc_t cs,
obj_t* obj );
// Checks the datatype and the object pointer of a buffer-less
// object-creation request.
void bli_obj_create_without_buffer_check( num_t dt,
dim_t m,
dim_t n,
obj_t* obj );
// Checks the strides (against the object's dimensions) and the object
// pointer of a buffer-allocation request.
void bli_obj_alloc_buffer_check( inc_t rs,
inc_t cs,
inc_t is,
obj_t* obj );
// Checks the strides (against the object's dimensions) and the object
// pointer of a buffer-attachment request. The buffer pointer p itself is
// not checked.
void bli_obj_attach_buffer_check( void* p,
inc_t rs,
inc_t cs,
inc_t is,
obj_t* obj );
// Checks the datatype and the object pointer of a scalar-object creation
// request.
void bli_obj_create_scalar_check( num_t dt,
obj_t* obj );
// Validation hook for freeing an object; the definition performs no checks.
void bli_obj_free_check( obj_t* obj );
// Checks the object pointer of a constant-object creation request.
void bli_obj_create_const_check( double value, obj_t* obj );
// NOTE(review): the definition of this function in bli_obj_check.c is
// wrapped in #if 0, so this prototype has no compiled definition there --
// confirm whether it is defined elsewhere or should be removed.
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
// Checks that dt is a valid datatype.
void bli_dt_size_check( num_t dt );
// Checks that dt is a non-constant datatype.
void bli_dt_string_check( num_t dt );
// Checks that dt1 and dt2 are both floating-point datatypes.
void bli_dt_union_check( num_t dt1, num_t dt2 );
// Checks that neither the label nor the object pointer is NULL.
void bli_obj_print_check( char* label, obj_t* obj );
cython-blis-1.0.0/blis/_src/frame/base/check/bli_part_check.c 0000664 0000000 0000000 00000006277 14634250137 0024021 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
/*
 * Argument check for the top-to-bottom partition acquisition.
 *
 * Three checks run in order, each result being handed to
 * bli_check_error_code() before the next check starts:
 *   1. requested_part must be a valid 3x1 subpartition selector,
 *   2. obj must not be a null pointer,
 *   3. sub_obj must not be a null pointer.
 *
 * The index i and blocksize b are accepted for signature parity but are not
 * inspected here.
 */
void bli_acquire_mpart_t2b_check( subpart_t requested_part,
                                  dim_t     i,
                                  dim_t     b,
                                  obj_t*    obj,
                                  obj_t*    sub_obj )
{
	const err_t part_status = bli_check_valid_3x1_subpart( requested_part );
	bli_check_error_code( part_status );

	const err_t obj_status = bli_check_null_pointer( obj );
	bli_check_error_code( obj_status );

	const err_t sub_obj_status = bli_check_null_pointer( sub_obj );
	bli_check_error_code( sub_obj_status );
}
/*
 * Argument check for the left-to-right partition acquisition.
 *
 * Three checks run in order, each result being handed to
 * bli_check_error_code() before the next check starts:
 *   1. requested_part must be a valid 1x3 subpartition selector,
 *   2. obj must not be a null pointer,
 *   3. sub_obj must not be a null pointer.
 *
 * The index j and blocksize b are accepted for signature parity but are not
 * inspected here.
 */
void bli_acquire_mpart_l2r_check( subpart_t requested_part,
                                  dim_t     j,
                                  dim_t     b,
                                  obj_t*    obj,
                                  obj_t*    sub_obj )
{
	const err_t part_status = bli_check_valid_1x3_subpart( requested_part );
	bli_check_error_code( part_status );

	const err_t obj_status = bli_check_null_pointer( obj );
	bli_check_error_code( obj_status );

	const err_t sub_obj_status = bli_check_null_pointer( sub_obj );
	bli_check_error_code( sub_obj_status );
}
/*
 * Argument check for the top-left-to-bottom-right partition acquisition.
 *
 * Three checks run in order, each result being handed to
 * bli_check_error_code() before the next check starts:
 *   1. requested_part must be a valid 3x3 subpartition selector,
 *   2. obj must not be a null pointer,
 *   3. sub_obj must not be a null pointer.
 *
 * The diagonal index ij and blocksize b are accepted for signature parity
 * but are not inspected here.
 */
void bli_acquire_mpart_tl2br_check( subpart_t requested_part,
                                    dim_t     ij,
                                    dim_t     b,
                                    obj_t*    obj,
                                    obj_t*    sub_obj )
{
	const err_t part_status = bli_check_valid_3x3_subpart( requested_part );
	bli_check_error_code( part_status );

	const err_t obj_status = bli_check_null_pointer( obj );
	bli_check_error_code( obj_status );

	const err_t sub_obj_status = bli_check_null_pointer( sub_obj );
	bli_check_error_code( sub_obj_status );
}
cython-blis-1.0.0/blis/_src/frame/base/check/bli_part_check.h 0000664 0000000 0000000 00000004654 14634250137 0024023 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Argument checks for the three partition-acquisition directions. Each
   definition (bli_part_check.c) validates requested_part against the
   subpartition family named below and null-checks obj and sub_obj; the
   index and blocksize arguments are not inspected. */

/* Top-to-bottom: requested_part must be a valid 3x1 subpartition. */
void bli_acquire_mpart_t2b_check( subpart_t requested_part,
dim_t i,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
/* Left-to-right: requested_part must be a valid 1x3 subpartition. */
void bli_acquire_mpart_l2r_check( subpart_t requested_part,
dim_t j,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
/* Top-left-to-bottom-right: requested_part must be a valid 3x3 subpartition. */
void bli_acquire_mpart_tl2br_check( subpart_t requested_part,
dim_t ij,
dim_t b,
obj_t* obj,
obj_t* sub_obj );
cython-blis-1.0.0/blis/_src/frame/base/noopt/ 0000775 0000000 0000000 00000000000 14634250137 0020772 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_dlamch.c 0000664 0000000 0000000 00000071162 14634250137 0023223 0 ustar 00root root 0000000 0000000 #include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef BLIS_ENABLE_LEGACY_LAMCH
/* Forward declaration of the integer-power helper defined just below. */
double bli_pow_di( bla_double* a, bla_integer* n );
/* Table of constant values */
//static bla_integer c__1 = 1;
/* Zero constant; passed by address as the second addend to bli_dlamc3()
   in bli_dlamc5(). */
static bla_double c_b32 = 0.;
/*
 * bli_pow_di
 *
 * Raise a double to an integer power using binary (square-and-multiply)
 * exponentiation.
 *
 *   ap  (input) pointer to the base x.
 *   bp  (input) pointer to the exponent n; may be zero or negative.
 *
 * Returns x**n. For n == 0 the result is 1 regardless of x. For n < 0 the
 * reciprocal 1/x is raised to |n|.
 */
double bli_pow_di(bla_double *ap, bla_integer *bp)
{
    double             result = 1;
    double             x      = *ap;
    const bla_integer  n      = *bp;
    unsigned long long u;

    if ( n == 0 )
        return result;

    if ( n < 0 )
    {
        /* Form |n| in unsigned arithmetic. Negating the most negative
           bla_integer as a signed value overflows (undefined behavior),
           and a type narrower than bla_integer would truncate it. */
        u = 0ULL - ( unsigned long long )n;
        x = 1/x;
    }
    else
    {
        u = ( unsigned long long )n;
    }

    /* Consume the exponent one bit at a time, least significant first. */
    for ( ; ; )
    {
        if ( u & 1 )
            result *= x;
        if ( u >>= 1 )
            x *= x;
        else
            break;
    }

    return result;
}
/*
 * bli_dlamch (legacy, run-time probing implementation)
 *
 * -- LAPACK auxiliary routine (version 3.2) --
 *    Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
 *    November 2006
 *
 * Purpose
 * =======
 * DLAMCH determines double precision machine parameters.
 *
 * Arguments
 * =========
 * CMACH   (input) CHARACTER*1
 *         Specifies the value to be returned by DLAMCH:
 *         = 'E' or 'e',   DLAMCH := eps
 *         = 'S' or 's',   DLAMCH := sfmin
 *         = 'B' or 'b',   DLAMCH := base
 *         = 'P' or 'p',   DLAMCH := eps*base
 *         = 'N' or 'n',   DLAMCH := t
 *         = 'R' or 'r',   DLAMCH := rnd
 *         = 'M' or 'm',   DLAMCH := emin
 *         = 'U' or 'u',   DLAMCH := rmin
 *         = 'L' or 'l',   DLAMCH := emax
 *         = 'O' or 'o',   DLAMCH := rmax
 *         where
 *         eps   = relative machine precision
 *         sfmin = safe minimum, such that 1/sfmin does not overflow
 *         base  = base of the machine
 *         prec  = eps*base
 *         t     = number of (base) digits in the mantissa
 *         rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
 *         emin  = minimum exponent before (gradual) underflow
 *         rmin  = underflow threshold - base**(emin-1)
 *         emax  = largest exponent before overflow
 *         rmax  = overflow threshold  - (base**emax)*(1-eps)
 *
 * CMACH_LEN is the f2c string-length argument; it is not read here.
 *
 * Returns the selected parameter, or 0.0 when CMACH matches none of the
 * selectors above (the same fallback as the non-legacy implementation in
 * this file).
 *
 * The parameters are computed once, on the first call, via bli_dlamc2() and
 * kept in static storage for later calls.
 * NOTE(review): the static cache is filled without any synchronization
 * visible in this function.
 */
bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len)
{
    /* Initialized data */
    static bla_logical first = TRUE_;

    /* System generated locals */
    bla_integer i__1;
    bla_double rmach;

    /* Builtin functions */
    double bli_pow_di(bla_double *, bla_integer *);

    /* Local variables (static: they cache the probed machine parameters) */
    static bla_double base;
    static bla_integer beta;
    static bla_double emin, prec, emax;
    static bla_integer imin, imax;
    static bla_logical lrnd;
    static bla_double rmin, rmax, t;
    extern bla_logical bli_lsame(bla_character *, bla_character *, ftnlen, ftnlen);
    static bla_double smnum, sfmin;
    extern /* Subroutine */ int bli_dlamc2(bla_integer *, bla_integer *, bla_logical *,
        bla_double *, bla_integer *, bla_double *, bla_integer *, bla_double *);
    static bla_integer it;
    static bla_double rnd, eps;

    if (first) {
        bli_dlamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
        base = (bla_double) beta;
        t = (bla_double) it;
        if (lrnd) {
            rnd = 1.;
            i__1 = 1 - it;
            eps = bli_pow_di(&base, &i__1) / 2;
        } else {
            rnd = 0.;
            i__1 = 1 - it;
            eps = bli_pow_di(&base, &i__1);
        }
        prec = eps * base;
        emin = (bla_double) imin;
        emax = (bla_double) imax;
        sfmin = rmin;
        smnum = 1. / rmax;
        if (smnum >= sfmin) {
            /* Use SMALL plus a bit, to avoid the possibility of rounding */
            /* causing overflow when computing 1/sfmin. */
            sfmin = smnum * (eps + 1.);
        }
    }

    if (bli_lsame(cmach, "E", (ftnlen)1, (ftnlen)1)) {
        rmach = eps;
    } else if (bli_lsame(cmach, "S", (ftnlen)1, (ftnlen)1)) {
        rmach = sfmin;
    } else if (bli_lsame(cmach, "B", (ftnlen)1, (ftnlen)1)) {
        rmach = base;
    } else if (bli_lsame(cmach, "P", (ftnlen)1, (ftnlen)1)) {
        rmach = prec;
    } else if (bli_lsame(cmach, "N", (ftnlen)1, (ftnlen)1)) {
        rmach = t;
    } else if (bli_lsame(cmach, "R", (ftnlen)1, (ftnlen)1)) {
        rmach = rnd;
    } else if (bli_lsame(cmach, "M", (ftnlen)1, (ftnlen)1)) {
        rmach = emin;
    } else if (bli_lsame(cmach, "U", (ftnlen)1, (ftnlen)1)) {
        rmach = rmin;
    } else if (bli_lsame(cmach, "L", (ftnlen)1, (ftnlen)1)) {
        rmach = emax;
    } else if (bli_lsame(cmach, "O", (ftnlen)1, (ftnlen)1)) {
        rmach = rmax;
    } else {
        /* Unknown selector: return a defined value instead of whatever a
           previous call happened to leave behind. */
        rmach = 0.;
    }

    first = FALSE_;
    return rmach;

/*     End of DLAMCH */

} /* bli_dlamch_ */
/* *********************************************************************** */
/* bli_dlamc1: probes the floating-point arithmetic at run time and reports */
/* the base (*beta), the number of mantissa digits (*t), whether addition */
/* appears to round (*rnd) and whether that rounding looks like IEEE */
/* round-to-nearest (*ieee1). The probing runs only while the static flag */
/* `first` is set; later calls copy out the values cached in the static */
/* locals. Always returns 0. */
/* Subroutine */ int bli_dlamc1(bla_integer *beta, bla_integer *t, bla_logical *rnd, bla_logical
*ieee1)
{
/* Initialized data */
static bla_logical first = TRUE_;
/* System generated locals */
bla_double d__1, d__2;
/* Local variables */
/* (static so that the results survive to later calls) */
static bla_logical lrnd;
static bla_double a, b, c__, f;
static bla_integer lbeta;
static bla_double savec;
extern bla_double bli_dlamc3(bla_double *, bla_double *);
static bla_logical lieee1;
static bla_double t1, t2;
static bla_integer lt;
static bla_double one, qtr;
/* -- LAPACK auxiliary routine (version 3.2) -- */
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/* November 2006 */
/* .. Scalar Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DLAMC1 determines the machine parameters given by BETA, T, RND, and */
/* IEEE1. */
/* Arguments */
/* ========= */
/* BETA (output) INTEGER */
/* The base of the machine. */
/* T (output) INTEGER */
/* The number of ( BETA ) digits in the mantissa. */
/* RND (output) LOGICAL */
/* Specifies whether proper rounding ( RND = .TRUE. ) or */
/* chopping ( RND = .FALSE. ) occurs in addition. This may not */
/* be a reliable guide to the way in which the machine performs */
/* its arithmetic. */
/* IEEE1 (output) LOGICAL */
/* Specifies whether rounding appears to be done in the IEEE */
/* 'round to nearest' style. */
/* Further Details */
/* =============== */
/* The routine is based on the routine ENVRON by Malcolm and */
/* incorporates suggestions by Gentleman and Marovich. See */
/* Malcolm M. A. (1972) Algorithms to reveal properties of */
/* floating-point arithmetic. Comms. of the ACM, 15, 949-951. */
/* Gentleman W. M. and Marovich S. B. (1974) More on algorithms */
/* that reveal properties of floating point arithmetic units. */
/* Comms. of the ACM, 17, 276-277. */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. Save statement .. */
/* .. */
/* .. Data statements .. */
/* .. */
/* .. Executable Statements .. */
if (first) {
one = 1.;
/* LBETA, LIEEE1, LT and LRND are the local values of BETA, */
/* IEEE1, T and RND. */
/* Throughout this routine we use the function DLAMC3 to ensure */
/* that relevant values are stored and not held in registers, or */
/* are not affected by optimizers. */
/* Compute a = 2.0**m with the smallest positive bla_integer m such */
/* that */
/* fl( a + 1.0 ) = a. */
a = 1.;
c__ = 1.;
/* + WHILE( C.EQ.ONE )LOOP */
/* Each pass doubles a and recomputes c = (a + 1) - a. */
L10:
if (c__ == one) {
a *= 2;
c__ = bli_dlamc3(&a, &one);
d__1 = -a;
c__ = bli_dlamc3(&c__, &d__1);
goto L10;
}
/* + END WHILE */
/* Now compute b = 2.0**m with the smallest positive bla_integer m */
/* such that */
/* fl( a + b ) .gt. a. */
b = 1.;
c__ = bli_dlamc3(&a, &b);
/* + WHILE( C.EQ.A )LOOP */
L20:
if (c__ == a) {
b *= 2;
c__ = bli_dlamc3(&a, &b);
goto L20;
}
/* + END WHILE */
/* Now compute the base. a and c are neighbouring floating point */
/* numbers in the interval ( beta**t, beta**( t + 1 ) ) and so */
/* their difference is beta. Adding 0.25 to c is to ensure that it */
/* is truncated to beta and not ( beta - 1 ). */
qtr = one / 4;
savec = c__;
d__1 = -a;
c__ = bli_dlamc3(&c__, &d__1);
lbeta = (bla_integer) (c__ + qtr);
/* Now determine whether rounding or chopping occurs, by adding a */
/* bit less than beta/2 and a bit more than beta/2 to a. */
b = (bla_double) lbeta;
/* First probe: f = b/2 - b/100. */
d__1 = b / 2;
d__2 = -b / 100;
f = bli_dlamc3(&d__1, &d__2);
c__ = bli_dlamc3(&f, &a);
if (c__ == a) {
lrnd = TRUE_;
} else {
lrnd = FALSE_;
}
/* Second probe: f = b/2 + b/100; if a is still unchanged, lrnd is */
/* withdrawn. */
d__1 = b / 2;
d__2 = b / 100;
f = bli_dlamc3(&d__1, &d__2);
c__ = bli_dlamc3(&f, &a);
if (lrnd && c__ == a) {
lrnd = FALSE_;
}
/* Try and decide whether rounding is done in the IEEE 'round to */
/* nearest' style. B/2 is half a unit in the last place of the two */
/* numbers A and SAVEC. Furthermore, A is even, i.e. has last bit */
/* zero, and SAVEC is odd. Thus adding B/2 to A should not change */
/* A, but adding B/2 to SAVEC should change SAVEC. */
d__1 = b / 2;
t1 = bli_dlamc3(&d__1, &a);
d__1 = b / 2;
t2 = bli_dlamc3(&d__1, &savec);
lieee1 = t1 == a && t2 > savec && lrnd;
/* Now find the mantissa, t. It should be the bla_integer part of */
/* log to the base beta of a, however it is safer to determine t */
/* by powering. So we find t as the smallest positive bla_integer for */
/* which */
/* fl( beta**t + 1.0 ) = 1.0. */
lt = 0;
a = 1.;
c__ = 1.;
/* + WHILE( C.EQ.ONE )LOOP */
/* lt counts how many times a is multiplied by lbeta before the loop ends. */
L30:
if (c__ == one) {
++lt;
a *= lbeta;
c__ = bli_dlamc3(&a, &one);
d__1 = -a;
c__ = bli_dlamc3(&c__, &d__1);
goto L30;
}
/* + END WHILE */
}
/* Publish the (possibly cached) results and disable further probing. */
*beta = lbeta;
*t = lt;
*rnd = lrnd;
*ieee1 = lieee1;
first = FALSE_;
return 0;
/* End of DLAMC1 */
} /* bli_dlamc1_ */
/* *********************************************************************** */
/*
 * bli_dlamc2
 *
 * -- LAPACK auxiliary routine (version 3.2) --
 *    Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
 *    November 2006
 *
 * Purpose
 * =======
 * DLAMC2 determines the machine parameters specified in its argument
 * list.
 *
 * Arguments
 * =========
 * BETA    (output) INTEGER
 *         The base of the machine.
 * T       (output) INTEGER
 *         The number of ( BETA ) digits in the mantissa.
 * RND     (output) LOGICAL
 *         Specifies whether proper rounding  ( RND = .TRUE. )  or
 *         chopping  ( RND = .FALSE. )  occurs in addition. This may not
 *         be a reliable guide to the way in which the machine performs
 *         its arithmetic.
 * EPS     (output) DOUBLE PRECISION
 *         The smallest positive number such that
 *            fl( 1.0 - EPS ) .LT. 1.0,
 *         where fl denotes the computed value.
 * EMIN    (output) INTEGER
 *         The minimum exponent before (gradual) underflow occurs.
 * RMIN    (output) DOUBLE PRECISION
 *         The smallest normalized number for the machine, given by
 *         BASE**( EMIN - 1 ), where  BASE  is the floating point value
 *         of BETA.
 * EMAX    (output) INTEGER
 *         The maximum exponent before overflow occurs.
 * RMAX    (output) DOUBLE PRECISION
 *         The largest positive number for the machine, given by
 *         BASE**EMAX * ( 1 - EPS ), where  BASE  is the floating point
 *         value of BETA.
 *
 * Further Details
 * ===============
 * The computation of  EPS  is based on a routine PARANOIA by
 * W. Kahan of the University of California at Berkeley.
 *
 * The probing runs while the static flag `first` is set; the results are
 * kept in static locals and copied to the output arguments on every call.
 * When the EMIN determination had to guess (iwarn), a warning is printed
 * and `first` is set again, so the next call repeats the probing.
 * Always returns 0.
 */
/* Subroutine */ int bli_dlamc2(bla_integer *beta, bla_integer *t, bla_logical *rnd,
    bla_double *eps, bla_integer *emin, bla_double *rmin, bla_integer *emax,
    bla_double *rmax)
{
    /* Initialized data */
    static bla_logical first = TRUE_;
    static bla_logical iwarn = FALSE_;

    /* Format strings */
    static bla_character fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre\
ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the value EMIN loo\
ks\002,\002 acceptable please comment out \002,/\002 the IF block as marked \
within the code of routine\002,\002 DLAMC2,\002,/\002 otherwise supply EMIN \
explicitly.\002,/)";

    /* System generated locals */
    bla_integer i__1;
    bla_double d__1, d__2, d__3, d__4, d__5;

    /* Builtin functions */
    double bli_pow_di(bla_double *, bla_integer *);
    //bla_integer s_wsfe(cilist *), do_fio(bla_integer *, bla_character *, ftnlen), e_wsfe();

    /* Local variables */
    static bla_logical ieee;
    static bla_double half;
    static bla_logical lrnd;
    static bla_double leps, zero, a, b, c__;
    static bla_integer i__, lbeta;
    static bla_double rbase;
    static bla_integer lemin, lemax, gnmin;
    static bla_double smnum;
    static bla_integer gpmin;
    static bla_double third, lrmin, lrmax, sixth;
    extern /* Subroutine */ int bli_dlamc1(bla_integer *, bla_integer *, bla_logical *,
        bla_logical *);
    extern bla_double bli_dlamc3(bla_double *, bla_double *);
    static bla_logical lieee1;
    extern /* Subroutine */ int bli_dlamc4(bla_integer *, bla_double *, bla_integer *),
        bli_dlamc5(bla_integer *, bla_integer *, bla_integer *, bla_logical *, bla_integer *,
        bla_double *);
    static bla_integer lt, ngnmin, ngpmin;
    static bla_double one, two;

    /* Fortran I/O blocks */
    //static cilist io___58 = { 0, 6, 0, fmt_9999, 0 };

    if (first) {
        zero = 0.;
        one = 1.;
        two = 2.;

/*        LBETA, LT, LRND, LEPS, LEMIN and LRMIN  are the local values of */
/*        BETA, T, RND, EPS, EMIN and RMIN. */

/*        Throughout this routine  we use the function  DLAMC3  to ensure */
/*        that relevant values are stored  and not held in registers,  or */
/*        are not affected by optimizers. */

/*        DLAMC1 returns the parameters  LBETA, LT, LRND and LIEEE1. */

        /* The second argument is the address of lt (the mantissa digit
           count), not a comparison operator. */
        bli_dlamc1(&lbeta, &lt, &lrnd, &lieee1);

/*        Start to find EPS. */

        b = (bla_double) lbeta;
        i__1 = -lt;
        a = bli_pow_di(&b, &i__1);
        leps = a;

/*        Try some tricks to see whether or not this is the correct  EPS. */

        b = two / 3;
        half = one / 2;
        d__1 = -half;
        sixth = bli_dlamc3(&b, &d__1);
        third = bli_dlamc3(&sixth, &sixth);
        d__1 = -half;
        b = bli_dlamc3(&third, &d__1);
        b = bli_dlamc3(&b, &sixth);
        b = f2c_abs(b);
        if (b < leps) {
            b = leps;
        }

        leps = 1.;

/* +       WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
        while (leps > b && b > zero) {
            leps = b;
            d__1 = half * leps;
/* Computing 5th power */
            d__3 = two, d__4 = d__3, d__3 *= d__3;
/* Computing 2nd power */
            d__5 = leps;
            d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5);
            c__ = bli_dlamc3(&d__1, &d__2);
            d__1 = -c__;
            c__ = bli_dlamc3(&half, &d__1);
            b = bli_dlamc3(&half, &c__);
            d__1 = -b;
            c__ = bli_dlamc3(&half, &d__1);
            b = bli_dlamc3(&half, &c__);
        }
/* +       END WHILE */

        if (a < leps) {
            leps = a;
        }

/*        Computation of EPS complete. */

/*        Now find  EMIN.  Let A = + or - 1, and + or - (1 + BASE**(-3)). */
/*        Keep dividing  A by BETA until (gradual) underflow occurs. This */
/*        is detected when we cannot recover the previous A. */

        rbase = one / lbeta;
        smnum = one;
        for (i__ = 1; i__ <= 3; ++i__) {
            d__1 = smnum * rbase;
            smnum = bli_dlamc3(&d__1, &zero);
/* L20: */
        }
        a = bli_dlamc3(&one, &smnum);
        bli_dlamc4(&ngpmin, &one, &lbeta);
        d__1 = -one;
        bli_dlamc4(&ngnmin, &d__1, &lbeta);
        bli_dlamc4(&gpmin, &a, &lbeta);
        d__1 = -a;
        bli_dlamc4(&gnmin, &d__1, &lbeta);
        ieee = FALSE_;

        if (ngpmin == ngnmin && gpmin == gnmin) {
            if (ngpmin == gpmin) {
                lemin = ngpmin;
/*            ( Non twos-complement machines, no gradual underflow; */
/*              e.g.,  VAX ) */
            } else if (gpmin - ngpmin == 3) {
                lemin = ngpmin - 1 + lt;
                ieee = TRUE_;
/*            ( Non twos-complement machines, with gradual underflow; */
/*              e.g., IEEE standard followers ) */
            } else {
                lemin = f2c_min(ngpmin,gpmin);
/*            ( A guess; no known machine ) */
                iwarn = TRUE_;
            }

        } else if (ngpmin == gpmin && ngnmin == gnmin) {
            if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1) {
                lemin = f2c_max(ngpmin,ngnmin);
/*            ( Twos-complement machines, no gradual underflow; */
/*              e.g., CYBER 205 ) */
            } else {
                lemin = f2c_min(ngpmin,ngnmin);
/*            ( A guess; no known machine ) */
                iwarn = TRUE_;
            }

        } else if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1 && gpmin == gnmin)
        {
            if (gpmin - f2c_min(ngpmin,ngnmin) == 3) {
                lemin = f2c_max(ngpmin,ngnmin) - 1 + lt;
/*            ( Twos-complement machines with gradual underflow; */
/*              no known machine ) */
            } else {
                lemin = f2c_min(ngpmin,ngnmin);
/*            ( A guess; no known machine ) */
                iwarn = TRUE_;
            }

        } else {
/* Computing MIN */
            i__1 = f2c_min(ngpmin,ngnmin), i__1 = f2c_min(i__1,gpmin);
            lemin = f2c_min(i__1,gnmin);
/*         ( A guess; no known machine ) */
            iwarn = TRUE_;
        }
        first = FALSE_;
/* ** */
/* Comment out this if block if EMIN is ok */
        if (iwarn) {
            first = TRUE_;
            /*
            s_wsfe(&io___58);
            do_fio(&c__1, (bla_character *)&lemin, (ftnlen)sizeof(bla_integer));
            e_wsfe();
            */
            printf( "%s", fmt_9999 );
        }
/* ** */

/*        Assume IEEE arithmetic if we found denormalised  numbers above, */
/*        or if arithmetic seems to round in the  IEEE style,  determined */
/*        in routine DLAMC1. A true IEEE machine should have both  things */
/*        true; however, faulty machines may have one or the other. */

        ieee = ieee || lieee1;

/*        Compute  RMIN by successive division by  BETA. We could compute */
/*        RMIN as BASE**( EMIN - 1 ),  but some machines underflow during */
/*        this computation. */

        lrmin = 1.;
        i__1 = 1 - lemin;
        for (i__ = 1; i__ <= i__1; ++i__) {
            d__1 = lrmin * rbase;
            lrmin = bli_dlamc3(&d__1, &zero);
/* L30: */
        }

/*        Finally, call DLAMC5 to compute EMAX and RMAX. */

        /* As above, the second argument is the address of lt. */
        bli_dlamc5(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
    }

    *beta = lbeta;
    *t = lt;
    *rnd = lrnd;
    *eps = leps;
    *emin = lemin;
    *rmin = lrmin;
    *emax = lemax;
    *rmax = lrmax;

    return 0;

/*     End of DLAMC2 */

} /* bli_dlamc2_ */
/* *********************************************************************** */
/*
 * bli_dlamc3
 *
 * -- LAPACK auxiliary routine (version 3.2) --
 *    Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
 *    November 2006
 *
 * Purpose
 * =======
 * DLAMC3 is intended to force A and B to be stored prior to doing the
 * addition of A and B, for use in situations where optimizers might hold
 * one of these in a register.
 *
 * Arguments
 * =========
 * A, B    (input) DOUBLE PRECISION
 *         The values A and B.
 *
 * Returns the sum *a + *b.
 */
bla_double bli_dlamc3(bla_double *a, bla_double *b)
{
    const bla_double sum = *a + *b;

    return sum;

/*     End of DLAMC3 */

} /* bli_dlamc3_ */
/* *********************************************************************** */
/*
 * bli_dlamc4
 *
 * -- LAPACK auxiliary routine (version 3.2) --
 *    Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
 *    November 2006
 *
 * Purpose
 * =======
 * DLAMC4 is a service routine for DLAMC2.
 *
 * Arguments
 * =========
 * EMIN    (output) INTEGER
 *         The minimum exponent before (gradual) underflow, computed by
 *         setting A = START and dividing by BASE until the previous A
 *         can not be recovered.
 * START   (input) DOUBLE PRECISION
 *         The starting point for determining EMIN.
 * BASE    (input) INTEGER
 *         The base of the machine.
 *
 * Always returns 0.
 */
/* Subroutine */ int bli_dlamc4(bla_integer *emin, bla_double *start, bla_integer *base)
{
    /* Scratch values handed to bli_dlamc3() by address. */
    bla_integer reps;
    bla_double  tmp;

    /* Working state; kept static as in the f2c translation. */
    static bla_double  cur, zero, one, inv_base;
    static bla_double  down_div, down_mul;  /* cur scaled down: by / base, by * inv_base */
    static bla_double  back_div, back_mul;  /* the scaled values multiplied back up       */
    static bla_double  sum_div, sum_mul;    /* the scaled values added up base times      */
    static bla_integer k;
    extern bla_double bli_dlamc3(bla_double *, bla_double *);

    cur      = *start;
    one      = 1.;
    inv_base = one / *base;
    zero     = 0.;
    *emin    = 1;

    /* The first scaled-down value is formed with the reciprocal. */
    tmp      = cur * inv_base;
    down_div = bli_dlamc3(&tmp, &zero);

    back_div = cur;
    back_mul = cur;
    sum_div  = cur;
    sum_mul  = cur;

/* +    WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. */
/*    $       ( D1.EQ.A ).AND.( D2.EQ.A )      )LOOP */
    while (back_div == cur && back_mul == cur && sum_div == cur && sum_mul == cur) {
        --(*emin);
        cur = down_div;

        /* Scale down by division, then try to recover cur two ways. */
        tmp      = cur / *base;
        down_div = bli_dlamc3(&tmp, &zero);
        tmp      = down_div * *base;
        back_div = bli_dlamc3(&tmp, &zero);
        sum_div  = zero;
        reps     = *base;
        for (k = 1; k <= reps; ++k) {
            sum_div += down_div;
        }

        /* Same again, scaling down by the reciprocal instead. */
        tmp      = cur * inv_base;
        down_mul = bli_dlamc3(&tmp, &zero);
        tmp      = down_mul / inv_base;
        back_mul = bli_dlamc3(&tmp, &zero);
        sum_mul  = zero;
        reps     = *base;
        for (k = 1; k <= reps; ++k) {
            sum_mul += down_mul;
        }
    }
/* +    END WHILE */

    return 0;

/*     End of DLAMC4 */

} /* bli_dlamc4_ */
/* *********************************************************************** */
/*
 * bli_dlamc5
 *
 * -- LAPACK auxiliary routine (version 3.2) --
 *    Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
 *    November 2006
 *
 * Purpose
 * =======
 * DLAMC5 attempts to compute RMAX, the largest machine floating-point
 * number, without overflow.  It assumes that EMAX + f2c_abs(EMIN) sum
 * approximately to a power of 2.  It will fail on machines where this
 * assumption does not hold, for example, the Cyber 205 (EMIN = -28625,
 * EMAX = 28718).  It will also fail if the value supplied for EMIN is
 * too large (i.e. too close to zero), probably with overflow.
 *
 * Arguments
 * =========
 * BETA    (input) INTEGER
 *         The base of floating-point arithmetic.
 * P       (input) INTEGER
 *         The number of base BETA digits in the mantissa of a
 *         floating-point value.
 * EMIN    (input) INTEGER
 *         The minimum exponent before (gradual) underflow.
 * IEEE    (input) LOGICAL
 *         A bla_logical flag specifying whether or not the arithmetic
 *         system is thought to comply with the IEEE standard.
 * EMAX    (output) INTEGER
 *         The largest exponent before overflow
 * RMAX    (output) DOUBLE PRECISION
 *         The largest machine floating-point number.
 *
 * Always returns 0.
 */
/* Subroutine */ int bli_dlamc5(bla_integer *beta, bla_integer *p, bla_integer *emin,
    bla_logical *ieee, bla_integer *emax, bla_double *rmax)
{
    /* Scratch values. */
    bla_integer limit;
    bla_double  scaled;

    /* Working state; kept static as in the f2c translation. */
    static bla_integer lo_pow, hi_pow, doubled;
    static bla_integer exp_bits, exp_range, total_bits, k;
    static bla_double  acc, prev_acc, term, inv_beta;
    extern bla_double bli_dlamc3(bla_double *, bla_double *);

/*     First compute LEXP and UEXP, two powers of 2 that bound */
/*     f2c_abs(EMIN). We then assume that EMAX + f2c_abs(EMIN) will sum */
/*     approximately to the bound that is closest to f2c_abs(EMIN). */
/*     (EMAX is the exponent of the required number RMAX). */

    lo_pow   = 1;
    exp_bits = 1;
    for ( ; ; ) {
        doubled = lo_pow << 1;
        if (doubled > -(*emin)) {
            break;
        }
        lo_pow = doubled;
        ++exp_bits;
    }

    if (lo_pow != -(*emin)) {
        hi_pow = doubled;
        ++exp_bits;
    } else {
        hi_pow = lo_pow;
    }

/*     Now -LEXP is less than or equal to EMIN, and -UEXP is greater */
/*     than or equal to EMIN. EXBITS is the number of bits needed to */
/*     store the exponent. */

    exp_range = ((hi_pow + *emin > -lo_pow - *emin) ? lo_pow : hi_pow) << 1;

/*     EXPSUM is the exponent range, approximately equal to */
/*     EMAX - EMIN + 1 . */

    *emax = exp_range + *emin - 1;
    total_bits = exp_bits + 1 + *p;

/*     NBITS is the total number of bits needed to store a */
/*     floating-point number. */

    if (total_bits % 2 == 1 && *beta == 2) {

/*        Either there are an odd number of bits used to store a */
/*        floating-point number, which is unlikely, or some bits are */
/*        not used in the representation of numbers, which is possible, */
/*        (e.g. Cray machines) or the mantissa has an implicit bit, */
/*        (e.g. IEEE machines, Dec Vax machines), which is perhaps the */
/*        most likely. We have to assume the last alternative. */
/*        If this is true, then we need to reduce EMAX by one because */
/*        there must be some way of representing zero in an implicit-bit */
/*        system. On machines like Cray, we are reducing EMAX by one */
/*        unnecessarily. */

        --(*emax);
    }

    if (*ieee) {

/*        Assume we are on an IEEE machine which reserves one exponent */
/*        for infinity and NaN. */

        --(*emax);
    }

/*     Now create RMAX, the largest machine number, which should */
/*     be equal to (1.0 - BETA**(-P)) * BETA**EMAX . */

/*     First compute 1.0 - BETA**(-P), being careful that the */
/*     result is less than 1.0 . */

    inv_beta = 1. / *beta;
    term     = *beta - 1.;
    acc      = 0.;
    limit    = *p;
    for (k = 1; k <= limit; ++k) {
        term *= inv_beta;
        if (acc < 1.) {
            prev_acc = acc;
        }
        acc = bli_dlamc3(&acc, &term);
    }
    if (acc >= 1.) {
        acc = prev_acc;
    }

/*     Now multiply by BETA**EMAX to get RMAX. */

    limit = *emax;
    for (k = 1; k <= limit; ++k) {
        scaled = acc * *beta;
        acc    = bli_dlamc3(&scaled, &c_b32);
    }

    *rmax = acc;
    return 0;

/*     End of DLAMC5 */

} /* bli_dlamc5_ */
#else
/*
 * bli_dlamch (default implementation, based on <float.h>)
 *
 * Returns the double precision machine parameter selected by the first
 * character of cmach (case-insensitive):
 *
 *         = 'E' or 'e',   DLAMCH := eps
 *         = 'S' or 's',   DLAMCH := sfmin
 *         = 'B' or 'b',   DLAMCH := base
 *         = 'P' or 'p',   DLAMCH := eps*base
 *         = 'N' or 'n',   DLAMCH := t
 *         = 'R' or 'r',   DLAMCH := rnd
 *         = 'M' or 'm',   DLAMCH := emin
 *         = 'U' or 'u',   DLAMCH := rmin
 *         = 'L' or 'l',   DLAMCH := emax
 *         = 'O' or 'o',   DLAMCH := rmax
 *
 *         where
 *
 *         eps   = relative machine precision
 *         sfmin = safe minimum, such that 1/sfmin does not overflow
 *         base  = base of the machine
 *         prec  = eps*base
 *         t     = number of (base) digits in the mantissa
 *         rnd   = 1.0 when rounding occurs in addition, 0.0 otherwise
 *         emin  = minimum exponent before (gradual) underflow
 *         rmin  = underflow threshold - base**(emin-1)
 *         emax  = largest exponent before overflow
 *         rmax  = overflow threshold  - (base**emax)*(1-eps)
 *
 * cmach_len is the f2c string-length argument; it is not read here.
 * Any other selector yields 0.0.
 */
bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len)
{
    double       safe_min  = DBL_MIN;
    const double recip_max = 1.0 / DBL_MAX;

    /* If 1/DBL_MAX is not below DBL_MIN, nudge it up slightly so that
       1/safe_min cannot overflow through rounding. */
    if ( recip_max >= safe_min )
        safe_min = recip_max * ( 1.0 + DBL_EPSILON );

    /* toupper() is only defined for values representable as unsigned char
       (or EOF), so convert before the call; plain char may be negative. */
    switch ( toupper( ( unsigned char )*cmach ) )
    {
        case 'E': return DBL_EPSILON;
        case 'S': return safe_min;
        case 'B': return FLT_RADIX;
        case 'P': return FLT_RADIX*DBL_EPSILON;
        case 'N': return DBL_MANT_DIG;
        /* FLT_ROUNDS uses the C-standard encoding in which 1 means "to
           nearest". FE_TONEAREST is an implementation-defined constant for
           fegetround()/fesetround() and is not comparable with it. */
        case 'R': return FLT_ROUNDS == 1 ? 1.0 : 0.0;
        case 'M': return DBL_MIN_EXP;
        case 'U': return DBL_MIN;
        case 'L': return DBL_MAX_EXP;
        case 'O': return DBL_MAX;
    }

    return 0.0;
}
#endif
#ifdef __cplusplus
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_dlamch.h 0000664 0000000 0000000 00000003312 14634250137 0023220 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Returns the double precision machine parameter selected by the character
   that cmach points to ('E', 'S', 'B', 'P', 'N', 'R', 'M', 'U', 'L' or 'O',
   in either case). cmach_len is the f2c string-length argument. */
bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_lsame.c 0000664 0000000 0000000 00000005355 14634250137 0023075 0 ustar 00root root 0000000 0000000 /* lsame.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries:
-lf2c -lm (in that order)
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "blis.h"
/*
 * bli_lsame -- case-insensitive comparison of two single characters
 * (f2c translation of the LAPACK 3.2 auxiliary routine LSAME).
 *
 * Arguments:
 *   ca, cb          Pointers to the single characters to be compared.
 *   ca_len, cb_len  Fortran hidden string lengths; not read by this routine.
 *
 * Returns nonzero if *ca and *cb are the same letter regardless of case,
 * and zero otherwise.
 */
bla_logical bli_lsame(bla_character *ca, bla_character *cb, ftnlen ca_len, ftnlen cb_len)
{
    /* Scratch values use automatic storage. The f2c output declared them
       'static', which made this otherwise pure function non-reentrant (a
       data race when called from several threads at once). Every one of
       them is assigned before it is read, so results are unchanged. */
    bla_logical ret_val;
    bla_integer inta, intb, zcode;

    /* Test if the characters are equal. */
    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
    if (ret_val) {
        return ret_val;
    }

    /* Now test for equivalence if both characters are alphabetic. */

    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime
       machines, on which ICHAR returns a value with bit 8 set.
       ICHAR('A') on Prime machines returns 193 which is the same as
       ICHAR('A') on an EBCDIC machine. */
    zcode = 'Z';

    inta = *(unsigned char *)ca;
    intb = *(unsigned char *)cb;

    if (zcode == 90 || zcode == 122) {
        /* ASCII is assumed - ZCODE is the ASCII code of either lower or
           upper case 'Z'. Fold lower case onto upper case. */
        if (inta >= 97 && inta <= 122) {
            inta += -32;
        }
        if (intb >= 97 && intb <= 122) {
            intb += -32;
        }
    } else if (zcode == 233 || zcode == 169) {
        /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
           upper case 'Z'. The lower-case letters occupy three ranges. */
        if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
            (inta >= 162 && inta <= 169)) {
            inta += 64;
        }
        if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
            (intb >= 162 && intb <= 169)) {
            intb += 64;
        }
    } else if (zcode == 218 || zcode == 250) {
        /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code
           plus 128 of either lower or upper case 'Z'. */
        if (inta >= 225 && inta <= 250) {
            inta += -32;
        }
        if (intb >= 225 && intb <= 250) {
            intb += -32;
        }
    }

    ret_val = inta == intb;
    return ret_val;
} /* bli_lsame */
#ifdef __cplusplus
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_lsame.h 0000664 0000000 0000000 00000003346 14634250137 0023100 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Case-insensitive comparison of the single characters *ca and *cb; returns
   nonzero when they are the same letter regardless of case. ca_len and
   cb_len are Fortran-style hidden string lengths. */
bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_slamch.c 0000664 0000000 0000000 00000071053 14634250137 0023241 0 ustar 00root root 0000000 0000000 #include "blis.h"
#include <float.h>
#include <fenv.h>
#include <ctype.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef BLIS_ENABLE_LEGACY_LAMCH
double bli_pow_ri( bla_real* a, bla_integer* n );
/* Table of constant values */
//static bla_integer c__1 = 1;
static bla_real c_b32 = (float)0.;
/*
 * bli_pow_ri -- raise a real value to an integer power.
 *
 * Arguments:
 *   ap  Pointer to the base.
 *   bp  Pointer to the integer exponent (may be negative or zero).
 *
 * Returns (*ap) ** (*bp) as a double, computed by binary exponentiation:
 * the exponent's bits are consumed from least to most significant while the
 * base is repeatedly squared. A negative exponent is handled by inverting
 * the base first.
 */
double bli_pow_ri(bla_real *ap, bla_integer *bp)
{
    double        result = 1;
    double        base   = *ap;
    bla_integer   expo   = *bp;
    unsigned long bits;

    /* Anything to the power zero is one. */
    if( expo == 0 )
        return result;

    if( expo < 0 )
    {
        expo = -expo;
        base = 1/base;
    }

    bits = expo;
    while( 1 )
    {
        /* Multiply in the current power of the base if its bit is set. */
        if( bits & 01 )
            result *= base;

        /* Stop before squaring once no exponent bits remain. */
        bits >>= 1;
        if( bits == 0 )
            break;

        base *= base;
    }

    return result;
}
/*
 * bli_slamch (legacy variant, built when BLIS_ENABLE_LEGACY_LAMCH is defined)
 * -- determine single precision machine parameters
 * (f2c translation of the LAPACK 3.2 auxiliary routine SLAMCH).
 *
 * Arguments:
 *   cmach      Pointer to a single character selecting the value returned:
 *                'E' or 'e'  eps   - relative machine precision
 *                'S' or 's'  sfmin - safe minimum, such that 1/sfmin does
 *                                    not overflow
 *                'B' or 'b'  base  - base of the machine
 *                'P' or 'p'  prec  - eps*base
 *                'N' or 'n'  t     - number of (base) digits in the mantissa
 *                'R' or 'r'  rnd   - 1.0 when rounding occurs in addition,
 *                                    0.0 otherwise
 *                'M' or 'm'  emin  - minimum exponent before (gradual)
 *                                    underflow
 *                'U' or 'u'  rmin  - underflow threshold, base**(emin-1)
 *                'L' or 'l'  emax  - largest exponent before overflow
 *                'O' or 'o'  rmax  - overflow threshold,
 *                                    (base**emax)*(1-eps)
 *   cmach_len  Fortran hidden string length; not read by this routine.
 *
 * Returns the selected parameter, or 0 when the selector is not recognized.
 *
 * The parameters are computed by bli_slamc2 on the first call only and kept
 * in static locals for later calls.
 */
bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len)
{
    /* Initialized data */
    static bla_logical first = TRUE_;
    /* System generated locals */
    bla_integer i__1;
    bla_real ret_val;
    bla_real rmach;
    /* Builtin functions */
    double bli_pow_ri(bla_real *, bla_integer *);
    /* Local variables (static: cached across calls) */
    static bla_real base;
    static bla_integer beta;
    static bla_real emin, prec, emax;
    static bla_integer imin, imax;
    static bla_logical lrnd;
    static bla_real rmin, rmax, t;
    extern bla_logical bli_lsame(bla_character *, bla_character *, ftnlen, ftnlen);
    static bla_real smnum, sfmin;
    extern /* Subroutine */ int bli_slamc2(bla_integer *, bla_integer *, bla_logical *, bla_real
	    *, bla_integer *, bla_real *, bla_integer *, bla_real *);
    static bla_integer it;
    static bla_real rnd, eps;

    if (first) {
        bli_slamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax);
        base = (bla_real) beta;
        t = (bla_real) it;
        if (lrnd) {
            /* With proper rounding the relative precision is half an ulp. */
            rnd = (float)1.;
            i__1 = 1 - it;
            eps = bli_pow_ri(&base, &i__1) / 2;
        } else {
            rnd = (float)0.;
            i__1 = 1 - it;
            eps = bli_pow_ri(&base, &i__1);
        }
        prec = eps * base;
        emin = (bla_real) imin;
        emax = (bla_real) imax;
        sfmin = rmin;
        smnum = (float)1. / rmax;
        if (smnum >= sfmin) {
            /* Use SMALL plus a bit, to avoid the possibility of rounding */
            /* causing overflow when computing 1/sfmin. */
            sfmin = smnum * (eps + (float)1.);
        }
    }

    if (bli_lsame(cmach, "E", (ftnlen)1, (ftnlen)1)) {
        rmach = eps;
    } else if (bli_lsame(cmach, "S", (ftnlen)1, (ftnlen)1)) {
        rmach = sfmin;
    } else if (bli_lsame(cmach, "B", (ftnlen)1, (ftnlen)1)) {
        rmach = base;
    } else if (bli_lsame(cmach, "P", (ftnlen)1, (ftnlen)1)) {
        rmach = prec;
    } else if (bli_lsame(cmach, "N", (ftnlen)1, (ftnlen)1)) {
        rmach = t;
    } else if (bli_lsame(cmach, "R", (ftnlen)1, (ftnlen)1)) {
        rmach = rnd;
    } else if (bli_lsame(cmach, "M", (ftnlen)1, (ftnlen)1)) {
        rmach = emin;
    } else if (bli_lsame(cmach, "U", (ftnlen)1, (ftnlen)1)) {
        rmach = rmin;
    } else if (bli_lsame(cmach, "L", (ftnlen)1, (ftnlen)1)) {
        rmach = emax;
    } else if (bli_lsame(cmach, "O", (ftnlen)1, (ftnlen)1)) {
        rmach = rmax;
    } else {
        /* Unrecognized selector: return a defined value instead of
           whatever a previous call happened to leave behind. This matches
           the non-legacy implementation in this file, which returns 0. */
        rmach = (float)0.;
    }

    ret_val = rmach;
    first = FALSE_;
    return ret_val;
/*     End of SLAMCH */
} /* bli_slamch_ */
/* *********************************************************************** */
/*
 * bli_slamc1 -- determine the machine parameters BETA, T, RND and IEEE1
 * (f2c translation of the LAPACK 3.2 auxiliary routine SLAMC1).
 *
 * Arguments (all outputs):
 *   beta   The base of the machine.
 *   t      The number of (BETA) digits in the mantissa.
 *   rnd    Whether proper rounding (TRUE_) or chopping (FALSE_) occurs in
 *          addition. This may not be a reliable guide to the way in which
 *          the machine performs its arithmetic.
 *   ieee1  Whether rounding appears to be done in the IEEE 'round to
 *          nearest' style.
 *
 * Always returns 0. The probing runs on the first call only; the results
 * live in static locals and are copied to the outputs on every call.
 *
 * The routine is based on the routine ENVRON by Malcolm and incorporates
 * suggestions by Gentleman and Marovich. See
 *   Malcolm M. A. (1972) Algorithms to reveal properties of floating-point
 *     arithmetic. Comms. of the ACM, 15, 949-951.
 *   Gentleman W. M. and Marovich S. B. (1974) More on algorithms that
 *     reveal properties of floating point arithmetic units.
 *     Comms. of the ACM, 17, 276-277.
 */
/* Subroutine */ int bli_slamc1(bla_integer *beta, bla_integer *t, bla_logical *rnd,
	bla_logical *ieee1)
{
    /* Set until the first call has finished probing. */
    static bla_logical first = TRUE_;

    /* Temporaries handed to bli_slamc3 by address. */
    bla_real r__1, r__2;

    /* LBETA, LIEEE1, LT and LRND are the local values of BETA, IEEE1, T
       and RND. */
    static bla_logical lrnd;
    static bla_real a, b, c, f;
    static bla_integer lbeta;
    static bla_real c_saved;
    static bla_logical lieee1;
    static bla_real t1, t2;
    extern bla_real bli_slamc3(bla_real *, bla_real *);
    static bla_integer lt;
    static bla_real one, quarter;

    if (first) {
        one = (float)1.;

        /* Throughout this routine every addition goes through bli_slamc3
           so that relevant values are stored and not held in registers,
           or otherwise affected by optimizers. */

        /* Compute a = 2.0**m with the smallest positive integer m such
           that fl( a + 1.0 ) = a. */
        a = (float)1.;
        c = (float)1.;
        while (c == one) {
            a *= 2;
            c = bli_slamc3(&a, &one);
            r__1 = -a;
            c = bli_slamc3(&c, &r__1);
        }

        /* Now compute b = 2.0**m with the smallest positive integer m
           such that fl( a + b ) .gt. a. */
        b = (float)1.;
        c = bli_slamc3(&a, &b);
        while (c == a) {
            b *= 2;
            c = bli_slamc3(&a, &b);
        }

        /* Now compute the base. a and c are neighbouring floating point
           numbers in the interval ( beta**t, beta**( t + 1 ) ) and so
           their difference is beta. Adding 0.25 to c is to ensure that it
           is truncated to beta and not ( beta - 1 ). */
        quarter = one / 4;
        c_saved = c;
        r__1 = -a;
        c = bli_slamc3(&c, &r__1);
        lbeta = c + quarter;

        /* Now determine whether rounding or chopping occurs, by adding a
           bit less than beta/2 and a bit more than beta/2 to a. */
        b = (bla_real) lbeta;

        r__1 = b / 2;
        r__2 = -b / 100;
        f = bli_slamc3(&r__1, &r__2);
        c = bli_slamc3(&f, &a);
        lrnd = (c == a) ? TRUE_ : FALSE_;

        r__1 = b / 2;
        r__2 = b / 100;
        f = bli_slamc3(&r__1, &r__2);
        c = bli_slamc3(&f, &a);
        if (lrnd && c == a) {
            lrnd = FALSE_;
        }

        /* Try and decide whether rounding is done in the IEEE 'round to
           nearest' style. B/2 is half a unit in the last place of the two
           numbers A and SAVEC. Furthermore, A is even, i.e. has last bit
           zero, and SAVEC is odd. Thus adding B/2 to A should not change
           A, but adding B/2 to SAVEC should change SAVEC. */
        r__1 = b / 2;
        t1 = bli_slamc3(&r__1, &a);
        r__1 = b / 2;
        t2 = bli_slamc3(&r__1, &c_saved);
        lieee1 = t1 == a && t2 > c_saved && lrnd;

        /* Now find the mantissa, t. It should be the integer part of log
           to the base beta of a, however it is safer to determine t by
           powering. So we find t as the smallest positive integer for
           which fl( beta**t + 1.0 ) = 1.0. */
        lt = 0;
        a = (float)1.;
        c = (float)1.;
        while (c == one) {
            ++lt;
            a *= lbeta;
            c = bli_slamc3(&a, &one);
            r__1 = -a;
            c = bli_slamc3(&c, &r__1);
        }
    }

    /* Hand the cached results back to the caller. */
    *beta = lbeta;
    *t = lt;
    *rnd = lrnd;
    *ieee1 = lieee1;
    first = FALSE_;
    return 0;
/*     End of SLAMC1 */
} /* bli_slamc1_ */
/* *********************************************************************** */
/*
 * bli_slamc2 -- determine the machine parameters specified in its argument
 * list (f2c translation of the LAPACK 3.2 auxiliary routine SLAMC2).
 *
 * Arguments (all outputs):
 *   beta  The base of the machine.
 *   t     The number of (BETA) digits in the mantissa.
 *   rnd   Whether proper rounding (TRUE_) or chopping (FALSE_) occurs in
 *         addition. This may not be a reliable guide to the way in which
 *         the machine performs its arithmetic.
 *   eps   The smallest positive number such that fl( 1.0 - EPS ) .LT. 1.0,
 *         where fl denotes the computed value.
 *   emin  The minimum exponent before (gradual) underflow occurs.
 *   rmin  The smallest normalized number for the machine, given by
 *         BASE**( EMIN - 1 ), where BASE is the floating point value of
 *         BETA.
 *   emax  The maximum exponent before overflow occurs.
 *   rmax  The largest positive number for the machine, given by
 *         BASE**EMAX * ( 1 - EPS ), where BASE is the floating point value
 *         of BETA.
 *
 * Always returns 0. The probing runs while the static flag 'first' is set;
 * results are kept in static locals and copied out on every call. When the
 * EMIN estimate is only a guess, a warning is printed and 'first' is left
 * set, so the probing is repeated on the next call.
 *
 * The computation of EPS is based on a routine PARANOIA by W. Kahan of the
 * University of California at Berkeley.
 */
/* Subroutine */ int bli_slamc2(bla_integer *beta, bla_integer *t, bla_logical *rnd,
	bla_real *eps, bla_integer *emin, bla_real *rmin, bla_integer *emax,
	bla_real *rmax)
{
    /* Initialized data */
    static bla_logical first = TRUE_;
    static bla_logical iwarn = FALSE_;
    /* Format strings */
    static bla_character fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre\
ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the value EMIN loo\
ks\002,\002 acceptable please comment out \002,/\002 the IF block as marked \
within the code of routine\002,\002 SLAMC2,\002,/\002 otherwise supply EMIN \
explicitly.\002,/)";
    /* System generated locals */
    bla_integer i__1;
    bla_real r__1, r__2, r__3, r__4, r__5;
    /* Builtin functions */
    double bli_pow_ri(bla_real *, bla_integer *);
    //bla_integer s_wsfe(cilist *), do_fio(bla_integer *, bla_character *, ftnlen), e_wsfe();
    /* Local variables */
    static bla_logical ieee;
    static bla_real half;
    static bla_logical lrnd;
    static bla_real leps, zero, a, b, c__;
    static bla_integer i__, lbeta;
    static bla_real rbase;
    static bla_integer lemin, lemax, gnmin;
    static bla_real smnum;
    static bla_integer gpmin;
    static bla_real third, lrmin, lrmax, sixth;
    static bla_logical lieee1;
    extern /* Subroutine */ int bli_slamc1(bla_integer *, bla_integer *, bla_logical *,
	    bla_logical *);
    extern bla_real bli_slamc3(bla_real *, bla_real *);
    extern /* Subroutine */ int bli_slamc4(bla_integer *, bla_real *, bla_integer *),
	    bli_slamc5(bla_integer *, bla_integer *, bla_integer *, bla_logical *, bla_integer *,
	    bla_real *);
    static bla_integer lt, ngnmin, ngpmin;
    static bla_real one, two;
    /* Fortran I/O blocks */
    //static cilist io___58 = { 0, 6, 0, fmt_9999, 0 };

    if (first) {
        zero = (float)0.;
        one = (float)1.;
        two = (float)2.;

        /* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of */
        /* BETA, T, RND, EPS, EMIN and RMIN. */

        /* Throughout this routine we use the function SLAMC3 to ensure */
        /* that relevant values are stored and not held in registers, or */
        /* are not affected by optimizers. */

        /* SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. */
        /* (The second argument is the address of lt, written '&lt'.) */
        bli_slamc1(&lbeta, &lt, &lrnd, &lieee1);

        /* Start to find EPS. */
        b = (bla_real) lbeta;
        i__1 = -lt;
        a = bli_pow_ri(&b, &i__1);
        leps = a;

        /* Try some tricks to see whether or not this is the correct EPS. */
        b = two / 3;
        half = one / 2;
        r__1 = -half;
        sixth = bli_slamc3(&b, &r__1);
        third = bli_slamc3(&sixth, &sixth);
        r__1 = -half;
        b = bli_slamc3(&third, &r__1);
        b = bli_slamc3(&b, &sixth);
        b = f2c_abs(b);
        if (b < leps) {
            b = leps;
        }

        leps = (float)1.;

        /* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */
L10:
        if (leps > b && b > zero) {
            leps = b;
            r__1 = half * leps;
            /* Computing 5th power */
            r__3 = two, r__4 = r__3, r__3 *= r__3;
            /* Computing 2nd power */
            r__5 = leps;
            r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5);
            c__ = bli_slamc3(&r__1, &r__2);
            r__1 = -c__;
            c__ = bli_slamc3(&half, &r__1);
            b = bli_slamc3(&half, &c__);
            r__1 = -b;
            c__ = bli_slamc3(&half, &r__1);
            b = bli_slamc3(&half, &c__);
            goto L10;
        }
        /* + END WHILE */

        if (a < leps) {
            leps = a;
        }

        /* Computation of EPS complete. */

        /* Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). */
        /* Keep dividing A by BETA until (gradual) underflow occurs. This */
        /* is detected when we cannot recover the previous A. */
        rbase = one / lbeta;
        smnum = one;
        for (i__ = 1; i__ <= 3; ++i__) {
            r__1 = smnum * rbase;
            smnum = bli_slamc3(&r__1, &zero);
            /* L20: */
        }
        a = bli_slamc3(&one, &smnum);
        bli_slamc4(&ngpmin, &one, &lbeta);
        r__1 = -one;
        bli_slamc4(&ngnmin, &r__1, &lbeta);
        bli_slamc4(&gpmin, &a, &lbeta);
        r__1 = -a;
        bli_slamc4(&gnmin, &r__1, &lbeta);
        ieee = FALSE_;

        if (ngpmin == ngnmin && gpmin == gnmin) {
            if (ngpmin == gpmin) {
                lemin = ngpmin;
                /* ( Non twos-complement machines, no gradual underflow; */
                /* e.g., VAX ) */
            } else if (gpmin - ngpmin == 3) {
                lemin = ngpmin - 1 + lt;
                ieee = TRUE_;
                /* ( Non twos-complement machines, with gradual underflow; */
                /* e.g., IEEE standard followers ) */
            } else {
                lemin = f2c_min(ngpmin,gpmin);
                /* ( A guess; no known machine ) */
                iwarn = TRUE_;
            }
        } else if (ngpmin == gpmin && ngnmin == gnmin) {
            if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1) {
                lemin = f2c_max(ngpmin,ngnmin);
                /* ( Twos-complement machines, no gradual underflow; */
                /* e.g., CYBER 205 ) */
            } else {
                lemin = f2c_min(ngpmin,ngnmin);
                /* ( A guess; no known machine ) */
                iwarn = TRUE_;
            }
        } else if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1 && gpmin == gnmin)
        {
            if (gpmin - f2c_min(ngpmin,ngnmin) == 3) {
                lemin = f2c_max(ngpmin,ngnmin) - 1 + lt;
                /* ( Twos-complement machines with gradual underflow; */
                /* no known machine ) */
            } else {
                lemin = f2c_min(ngpmin,ngnmin);
                /* ( A guess; no known machine ) */
                iwarn = TRUE_;
            }
        } else {
            /* Computing MIN */
            i__1 = f2c_min(ngpmin,ngnmin), i__1 = f2c_min(i__1,gpmin);
            lemin = f2c_min(i__1,gnmin);
            /* ( A guess; no known machine ) */
            iwarn = TRUE_;
        }
        first = FALSE_;
        /* ** */
        /* Comment out this if block if EMIN is ok */
        if (iwarn) {
            /* Leave 'first' set so the next call probes (and warns) again. */
            first = TRUE_;
            /*
            s_wsfe(&io___58);
            do_fio(&c__1, (bla_character *)&lemin, (ftnlen)sizeof(bla_integer));
            e_wsfe();
            */
            printf( "%s", fmt_9999 );
        }
        /* ** */

        /* Assume IEEE arithmetic if we found denormalised numbers above, */
        /* or if arithmetic seems to round in the IEEE style, determined */
        /* in routine SLAMC1. A true IEEE machine should have both things */
        /* true; however, faulty machines may have one or the other. */
        ieee = ieee || lieee1;

        /* Compute RMIN by successive division by BETA. We could compute */
        /* RMIN as BASE**( EMIN - 1 ), but some machines underflow during */
        /* this computation. */
        lrmin = (float)1.;
        i__1 = 1 - lemin;
        for (i__ = 1; i__ <= i__1; ++i__) {
            r__1 = lrmin * rbase;
            lrmin = bli_slamc3(&r__1, &zero);
            /* L30: */
        }

        /* Finally, call SLAMC5 to compute EMAX and RMAX. */
        /* (The second argument is the address of lt, written '&lt'.) */
        bli_slamc5(&lbeta, &lt, &lemin, &ieee, &lemax, &lrmax);
    }

    *beta = lbeta;
    *t = lt;
    *rnd = lrnd;
    *eps = leps;
    *emin = lemin;
    *rmin = lrmin;
    *emax = lemax;
    *rmax = lrmax;
    return 0;
/*     End of SLAMC2 */
} /* bli_slamc2_ */
/* *********************************************************************** */
/*
 * bli_slamc3 -- add two reals through a function-call boundary
 * (f2c translation of the LAPACK 3.2 auxiliary routine SLAMC3).
 *
 * SLAMC3 is intended to force A and B to be stored prior to doing the
 * addition of A and B, for use in situations where optimizers might hold
 * one of these in a register.
 *
 * Arguments:
 *   a, b  Pointers to the two values to be added.
 *
 * Returns *a + *b.
 */
bla_real bli_slamc3(bla_real *a, bla_real *b)
{
    bla_real sum;

    sum = *a + *b;

    return sum;
/*     End of SLAMC3 */
} /* bli_slamc3_ */
/* *********************************************************************** */
/*
 * bli_slamc4 -- service routine for SLAMC2
 * (f2c translation of the LAPACK 3.2 auxiliary routine SLAMC4).
 *
 * Arguments:
 *   emin   (output) The minimum exponent before (gradual) underflow,
 *          computed by setting A = START and dividing by BASE until the
 *          previous A can not be recovered.
 *   start  (input) The starting point for determining EMIN.
 *   base   (input) The base of the machine.
 *
 * Always returns 0.
 */
/* Subroutine */ int bli_slamc4(bla_integer *emin, bla_real *start, bla_integer *base)
{
    /* Loop bound snapshot and temporary handed to bli_slamc3 by address. */
    bla_integer i__1;
    bla_real r__1;

    /* Working values keep the static storage of the f2c translation. */
    static bla_real zero, a;
    static bla_integer k;
    static bla_real rbase, b1, b2, c1, c2, d1, d2;
    extern bla_real bli_slamc3(bla_real *, bla_real *);
    static bla_real one;

    a = *start;
    one = (float)1.;
    rbase = one / *base;
    zero = (float)0.;
    *emin = 1;

    r__1 = a * rbase;
    b1 = bli_slamc3(&r__1, &zero);

    c1 = a;
    c2 = a;
    d1 = a;
    d2 = a;

    /* Keep scaling down for as long as the previous value can be recovered
       by all four routes: multiplying back by BASE (c1), summing BASE
       copies (d1), dividing back by 1/BASE (c2), summing BASE copies of
       the reciprocal-scaled value (d2). */
    while (c1 == a && c2 == a && d1 == a && d2 == a) {
        --(*emin);
        a = b1;

        /* Scale by dividing by BASE. */
        r__1 = a / *base;
        b1 = bli_slamc3(&r__1, &zero);
        r__1 = b1 * *base;
        c1 = bli_slamc3(&r__1, &zero);
        d1 = zero;
        i__1 = *base;
        for (k = 1; k <= i__1; ++k) {
            d1 += b1;
        }

        /* Scale by multiplying by 1/BASE. */
        r__1 = a * rbase;
        b2 = bli_slamc3(&r__1, &zero);
        r__1 = b2 / rbase;
        c2 = bli_slamc3(&r__1, &zero);
        d2 = zero;
        i__1 = *base;
        for (k = 1; k <= i__1; ++k) {
            d2 += b2;
        }
    }

    return 0;
/*     End of SLAMC4 */
} /* bli_slamc4_ */
/* *********************************************************************** */
/*
 * bli_slamc5 -- attempt to compute RMAX, the largest machine floating-point
 * number, without overflow
 * (f2c translation of the LAPACK 3.2 auxiliary routine SLAMC5).
 *
 * It assumes that EMAX + abs(EMIN) sum approximately to a power of 2. It
 * will fail on machines where this assumption does not hold, for example
 * the Cyber 205 (EMIN = -28625, EMAX = 28718). It will also fail if the
 * value supplied for EMIN is too large (i.e. too close to zero), probably
 * with overflow.
 *
 * Arguments:
 *   beta  (input)  The base of floating-point arithmetic.
 *   p     (input)  The number of base BETA digits in the mantissa of a
 *                  floating-point value.
 *   emin  (input)  The minimum exponent before (gradual) underflow.
 *   ieee  (input)  Flag specifying whether or not the arithmetic system is
 *                  thought to comply with the IEEE standard.
 *   emax  (output) The largest exponent before overflow.
 *   rmax  (output) The largest machine floating-point number.
 *
 * Always returns 0.
 */
/* Subroutine */ int bli_slamc5(bla_integer *beta, bla_integer *p, bla_integer *emin,
	bla_logical *ieee, bla_integer *emax, bla_real *rmax)
{
    /* Loop bound snapshot and temporary handed to bli_slamc3 by address. */
    bla_integer i__1;
    bla_real r__1;

    /* Working values keep the static storage of the f2c translation. */
    static bla_integer lexp;
    static bla_real oldy;
    static bla_integer uexp, k;
    static bla_real y, z;
    static bla_integer nbits;
    extern bla_real bli_slamc3(bla_real *, bla_real *);
    static bla_real recbas;
    static bla_integer exbits, expsum, next;

    /* First compute LEXP and UEXP, two powers of 2 that bound abs(EMIN).
       We then assume that EMAX + abs(EMIN) will sum approximately to the
       bound that is closest to abs(EMIN). (EMAX is the exponent of the
       required number RMAX.) */
    lexp = 1;
    exbits = 1;
    for (;;) {
        next = lexp << 1;
        if (next > -(*emin)) {
            break;
        }
        lexp = next;
        ++exbits;
    }

    if (lexp == -(*emin)) {
        uexp = lexp;
    } else {
        uexp = next;
        ++exbits;
    }

    /* Now -LEXP is less than or equal to EMIN, and -UEXP is greater than
       or equal to EMIN. EXBITS is the number of bits needed to store the
       exponent. */

    /* EXPSUM is the exponent range, approximately equal to
       EMAX - EMIN + 1. */
    expsum = (uexp + *emin > -lexp - *emin) ? (lexp << 1) : (uexp << 1);
    *emax = expsum + *emin - 1;

    /* NBITS is the total number of bits needed to store a floating-point
       number. */
    nbits = exbits + 1 + *p;

    if (nbits % 2 == 1 && *beta == 2) {
        /* Either there are an odd number of bits used to store a
           floating-point number, which is unlikely, or some bits are not
           used in the representation of numbers, which is possible (e.g.
           Cray machines), or the mantissa has an implicit bit (e.g. IEEE
           machines, Dec Vax machines), which is perhaps the most likely.
           We have to assume the last alternative. If this is true, then we
           need to reduce EMAX by one because there must be some way of
           representing zero in an implicit-bit system. On machines like
           Cray, we are reducing EMAX by one unnecessarily. */
        --(*emax);
    }

    if (*ieee) {
        /* Assume we are on an IEEE machine which reserves one exponent
           for infinity and NaN. */
        --(*emax);
    }

    /* Now create RMAX, the largest machine number, which should be equal
       to (1.0 - BETA**(-P)) * BETA**EMAX. */

    /* First compute 1.0 - BETA**(-P), being careful that the result is
       less than 1.0. */
    recbas = (float)1. / *beta;
    z = *beta - (float)1.;
    y = (float)0.;
    i__1 = *p;
    for (k = 1; k <= i__1; ++k) {
        z *= recbas;
        if (y < (float)1.) {
            oldy = y;
        }
        y = bli_slamc3(&y, &z);
    }
    if (y >= (float)1.) {
        y = oldy;
    }

    /* Now multiply by BETA**EMAX to get RMAX. */
    i__1 = *emax;
    for (k = 1; k <= i__1; ++k) {
        r__1 = y * *beta;
        y = bli_slamc3(&r__1, &c_b32);
    }

    *rmax = y;
    return 0;
/*     End of SLAMC5 */
} /* bli_slamc5_ */
#else
/*
 * bli_slamch (non-legacy variant) -- return single precision machine
 * parameters taken from the C library's <float.h> constants.
 *
 * Arguments:
 *   cmach      Pointer to a single character selecting the value returned
 *              (case-insensitive):
 *                'E'  eps   - FLT_EPSILON
 *                'S'  sfmin - safe minimum, such that 1/sfmin does not
 *                             overflow
 *                'B'  base  - FLT_RADIX
 *                'P'  prec  - eps*base
 *                'N'  t     - number of (base) digits in the mantissa
 *                'R'  rnd   - 1.0 when rounding occurs in addition,
 *                             0.0 otherwise
 *                'M'  emin  - minimum exponent before (gradual) underflow
 *                'U'  rmin  - underflow threshold, FLT_MIN
 *                'L'  emax  - largest exponent before overflow
 *                'O'  rmax  - overflow threshold, FLT_MAX
 *   cmach_len  Fortran hidden string length; not read by this routine.
 *
 * Returns the selected parameter, or 0.0f when the selector is not
 * recognized.
 *
 * NOTE(review): 'E' returns FLT_EPSILON unchanged, whereas the legacy
 * variant in this file halves its eps when rounding is detected -- confirm
 * which convention callers expect.
 */
bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len)
{
    float safe_min = FLT_MIN;
    float inv_max  = 1.0f / FLT_MAX;

    /* Use 1/FLT_MAX plus a bit when it is the larger of the two, to avoid
       the possibility of rounding causing overflow when computing
       1/safe_min. */
    if ( inv_max >= safe_min )
        safe_min = inv_max * ( 1.0f + FLT_EPSILON );

    /* toupper() is only defined for EOF and values representable as
       unsigned char, so convert before calling it. */
    switch ( toupper( (unsigned char)*cmach ) )
    {
        case 'E': return FLT_EPSILON;
        case 'S': return safe_min;
        case 'B': return FLT_RADIX;
        case 'P': return FLT_RADIX*FLT_EPSILON;
        case 'N': return FLT_MANT_DIG;
        /* FLT_ROUNDS uses the <float.h> encoding in which 1 means "to
           nearest". It must not be compared against FE_TONEAREST, which is
           an unrelated <fenv.h> constant with an implementation-defined
           value. */
        case 'R': return FLT_ROUNDS == 1 ? 1.0f : 0.0f;
        case 'M': return FLT_MIN_EXP;
        case 'U': return FLT_MIN;
        case 'L': return FLT_MAX_EXP;
        case 'O': return FLT_MAX;
    }

    return 0.0f;
}
#endif
#ifdef __cplusplus
}
#endif
cython-blis-1.0.0/blis/_src/frame/base/noopt/bli_slamch.h 0000664 0000000 0000000 00000003310 14634250137 0023235 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Returns the single precision machine parameter selected by the character
   *cmach ('E', 'S', 'B', 'P', 'N', 'R', 'M', 'U', 'L' or 'O', in either
   case). cmach_len is a Fortran-style hidden string length. */
bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
cython-blis-1.0.0/blis/_src/frame/base/proj/ 0000775 0000000 0000000 00000000000 14634250137 0020605 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/proj/bli_projm.c 0000664 0000000 0000000 00000007116 14634250137 0022733 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Copy matrix a into matrix b, projecting between the real and complex
// domains whenever the two operands do not share a domain:
//   - same domain:      b := a                    (plain copym)
//   - real -> complex:  b := 0, then real(b) := a
//   - complex -> real:  b := real(a)
void bli_projm
     (
       obj_t* a,
       obj_t* b
     )
{
    // Receives the real part of whichever operand is complex in the
    // mixed-domain cases.
    obj_t real_part;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_projm_check( a, b );

    // Matching domains need no projection at all.
    if ( ( bli_obj_is_real( a )    && bli_obj_is_real( b )    ) ||
         ( bli_obj_is_complex( a ) && bli_obj_is_complex( b ) ) )
    {
        bli_copym( a, b );
        return;
    }

    // From here on, one operand is real and the other is complex.

    if ( bli_obj_is_real( a ) /* && bli_obj_is_complex( b ) */ )
    {
        // Real source, complex destination: clear every component of b
        // (imaginary parts included) and then write a into b's real part.
        bli_obj_real_part( b, &real_part );
        bli_setm( &BLIS_ZERO, b );
        bli_copym( a, &real_part );
        return;
    }

    // Complex source, real destination: only the real part of a is kept.
    bli_obj_real_part( a, &real_part );
    bli_copym( &real_part, b );
}
// -----------------------------------------------------------------------------
// Validate the operands of bli_projm(). Each check's result is handed to
// bli_check_error_code() before the next check runs, so the checks are
// evaluated strictly in the order written below.
void bli_projm_check
     (
       obj_t* a,
       obj_t* b
     )
{
    err_t status;

    // Datatypes: both operands must be floating-point objects whose
    // precisions are consistent with one another.
    status = bli_check_floating_object( a );
    bli_check_error_code( status );

    status = bli_check_floating_object( b );
    bli_check_error_code( status );

    status = bli_check_consistent_object_precisions( a, b );
    bli_check_error_code( status );

    // Dimensions: both operands must be matrices with conformal dimensions.
    status = bli_check_matrix_object( a );
    bli_check_error_code( status );

    status = bli_check_matrix_object( b );
    bli_check_error_code( status );

    status = bli_check_conformal_dims( a, b );
    bli_check_error_code( status );

    // Buffers: neither operand may have a NULL buffer.
    status = bli_check_object_buffer( a );
    bli_check_error_code( status );

    status = bli_check_object_buffer( b );
    bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/base/proj/bli_projm.h 0000664 0000000 0000000 00000003440 14634250137 0022734 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Copies matrix a into matrix b. When both operands are real or both are
// complex this is a plain copym; when a is real and b is complex, b is first
// set to zero and a is copied into b's real part; when a is complex and b is
// real, only the real part of a is copied into b.
BLIS_EXPORT_BLIS void bli_projm
(
obj_t* a,
obj_t* b
);
// Parameter checks for bli_projm(): floating-point datatypes with consistent
// precisions, matrix objects with conformal dimensions, and non-NULL buffers.
// bli_projm() calls this only when error checking is enabled.
void bli_projm_check
(
obj_t* a,
obj_t* b
);
cython-blis-1.0.0/blis/_src/frame/base/proj/bli_projv.c 0000664 0000000 0000000 00000007124 14634250137 0022743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Copy vector x into vector y, projecting between the real and complex
// domains whenever the two operands do not share a domain:
//   - same domain:      y := x                    (plain copyv)
//   - real -> complex:  y := 0, then real(y) := x
//   - complex -> real:  y := real(x)
void bli_projv
     (
       obj_t* x,
       obj_t* y
     )
{
    // Receives the real part of whichever operand is complex in the
    // mixed-domain cases.
    obj_t real_part;

    // Validate the operands when error checking is switched on.
    if ( bli_error_checking_is_enabled() )
        bli_projv_check( x, y );

    // Matching domains need no projection at all.
    if ( ( bli_obj_is_real( x )    && bli_obj_is_real( y )    ) ||
         ( bli_obj_is_complex( x ) && bli_obj_is_complex( y ) ) )
    {
        bli_copyv( x, y );
        return;
    }

    // From here on, one operand is real and the other is complex.

    if ( bli_obj_is_real( x ) /* && bli_obj_is_complex( y ) */ )
    {
        // Real source, complex destination: clear every component of y
        // (imaginary parts included) and then write x into y's real part.
        bli_obj_real_part( y, &real_part );
        bli_setv( &BLIS_ZERO, y );
        bli_copyv( x, &real_part );
        return;
    }

    // Complex source, real destination: only the real part of x is kept.
    bli_obj_real_part( x, &real_part );
    bli_copyv( &real_part, y );
}
// -----------------------------------------------------------------------------
// Validate the operands of bli_projv(). Each check's result is handed to
// bli_check_error_code() before the next check runs, so the checks are
// evaluated strictly in the order written below.
void bli_projv_check
     (
       obj_t* x,
       obj_t* y
     )
{
    err_t status;

    // Datatypes: both operands must be floating-point objects whose
    // precisions are consistent with one another.
    status = bli_check_floating_object( x );
    bli_check_error_code( status );

    status = bli_check_floating_object( y );
    bli_check_error_code( status );

    status = bli_check_consistent_object_precisions( x, y );
    bli_check_error_code( status );

    // Dimensions: both operands must be vectors of equal length.
    status = bli_check_vector_object( x );
    bli_check_error_code( status );

    status = bli_check_vector_object( y );
    bli_check_error_code( status );

    status = bli_check_equal_vector_lengths( x, y );
    bli_check_error_code( status );

    // Buffers: neither operand may have a NULL buffer.
    status = bli_check_object_buffer( x );
    bli_check_error_code( status );

    status = bli_check_object_buffer( y );
    bli_check_error_code( status );
}
cython-blis-1.0.0/blis/_src/frame/base/proj/bli_projv.h 0000664 0000000 0000000 00000003440 14634250137 0022745 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Copies vector x into vector y. When both operands are real or both are
// complex this is a plain copyv; when x is real and y is complex, y is first
// set to zero and x is copied into y's real part; when x is complex and y is
// real, only the real part of x is copied into y.
BLIS_EXPORT_BLIS void bli_projv
(
obj_t* x,
obj_t* y
);
// Parameter checks for bli_projv(): floating-point datatypes with consistent
// precisions, vector objects of equal length, and non-NULL buffers.
// bli_projv() calls this only when error checking is enabled.
void bli_projv_check
(
obj_t* x,
obj_t* y
);
cython-blis-1.0.0/blis/_src/frame/base/proj/old/ 0000775 0000000 0000000 00000000000 14634250137 0021363 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/base/proj/old/bli_proj_check.c 0000664 0000000 0000000 00000006326 14634250137 0024473 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
// Validate a pair of matrix operands for projection. Every result is passed
// to bli_check_error_code() immediately, so the checks run in the order
// listed here.
void bli_projm_check
     (
       obj_t* a,
       obj_t* b
     )
{
    err_t rc;

    // --- Datatypes: floating-point objects with consistent precisions. ---

    rc = bli_check_floating_object( a );
    bli_check_error_code( rc );

    rc = bli_check_floating_object( b );
    bli_check_error_code( rc );

    rc = bli_check_consistent_object_precisions( a, b );
    bli_check_error_code( rc );

    // --- Dimensions: matrix objects with conformal dimensions. ---

    rc = bli_check_matrix_object( a );
    bli_check_error_code( rc );

    rc = bli_check_matrix_object( b );
    bli_check_error_code( rc );

    rc = bli_check_conformal_dims( a, b );
    bli_check_error_code( rc );

    // --- Buffers: must not be NULL. ---

    rc = bli_check_object_buffer( a );
    bli_check_error_code( rc );

    rc = bli_check_object_buffer( b );
    bli_check_error_code( rc );
}
// Validate a pair of vector operands for projection. Every result is passed
// to bli_check_error_code() immediately, so the checks run in the order
// listed here.
void bli_projv_check
     (
       obj_t* x,
       obj_t* y
     )
{
    err_t rc;

    // --- Datatypes: floating-point objects with consistent precisions. ---

    rc = bli_check_floating_object( x );
    bli_check_error_code( rc );

    rc = bli_check_floating_object( y );
    bli_check_error_code( rc );

    rc = bli_check_consistent_object_precisions( x, y );
    bli_check_error_code( rc );

    // --- Dimensions: vector objects of equal length. ---

    rc = bli_check_vector_object( x );
    bli_check_error_code( rc );

    rc = bli_check_vector_object( y );
    bli_check_error_code( rc );

    rc = bli_check_equal_vector_lengths( x, y );
    bli_check_error_code( rc );

    // --- Buffers: must not be NULL. ---

    rc = bli_check_object_buffer( x );
    bli_check_error_code( rc );

    rc = bli_check_object_buffer( y );
    bli_check_error_code( rc );
}
cython-blis-1.0.0/blis/_src/frame/base/proj/old/bli_proj_check.h 0000664 0000000 0000000 00000003424 14634250137 0024474 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Parameter checks for a matrix projection from a to b: floating-point
// datatypes with consistent precisions, matrix objects with conformal
// dimensions, and non-NULL buffers.
void bli_projm_check
(
obj_t* a,
obj_t* b
);
// Parameter checks for a vector projection from x to y: floating-point
// datatypes with consistent precisions, vector objects of equal length, and
// non-NULL buffers.
void bli_projv_check
(
obj_t* x,
obj_t* y
);
cython-blis-1.0.0/blis/_src/frame/compat/ 0000775 0000000 0000000 00000000000 14634250137 0020204 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/amd/ 0000775 0000000 0000000 00000000000 14634250137 0020745 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/amd/bla_copy_amd.c 0000664 0000000 0000000 00000011116 14634250137 0023522 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Real-domain ?copy wrapper.
//
// GENTFUNC generates the BLAS-style copy entry point whose name is formed by
// PASTEF77(ch,blasname). The generated function:
//   - converts *n to a dim_t, mapping negative values to zero;
//   - adjusts the x and y pointers/increments so that negative BLAS
//     increments can be handled with positive ones;
//   - calls the routine named by PASTEMAC2(ch,blisname,isuf) with
//     BLIS_NO_CONJUGATE and a single trailing NULL argument
//     (NOTE(review): presumably the context pointer -- confirm).
// The bli_init_auto()/bli_finalize_auto() calls are intentionally commented
// out; see the NOTE inside the macro body.
//
// When BLIS_ENABLE_BLAS is defined, the macro is instantiated for float (s)
// and double (d) with the _zen_int suffix.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
/*bli_init_auto()*/; \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
/* NOTE: While we skip explicit initialization for real domain instances
since we call the microkernel directly, the complex domain instances
still need initialization so that they can query valid contexts from
gks. However, the expert API will self-initialize before attempting
to query a context, so the complex domain cases should work fine. */ \
PASTEMAC2(ch,blisname,isuf) \
( \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL \
); \
\
/* Finalize BLIS. */ \
/*bli_finalize_auto();*/ \
}
#ifdef BLIS_ENABLE_BLAS
//INSERT_GENTFUNC_BLAS( copy, copyv )
GENTFUNC( float, s, copy, copyv, _zen_int )
GENTFUNC( double, d, copy, copyv, _zen_int )
#endif
// Complex-domain ?copy wrapper.
//
// GENTFUNC is redefined here so that the generated BLAS-style copy entry
// point (named by PASTEF77(ch,blasname)) passes two trailing NULL arguments
// to the routine named by PASTEMAC2(ch,blisname,isuf) instead of one
// (NOTE(review): presumably the context and runtime pointers of the expert
// interface -- confirm). As in the real-domain variant, negative n is mapped
// to zero and the x/y pointers are adjusted for negative increments before
// the call, and the bli_init_auto()/bli_finalize_auto() calls are commented
// out; see the NOTE inside the macro body.
//
// When BLIS_ENABLE_BLAS is defined, the macro is instantiated for scomplex
// (c) and dcomplex (z) with the _ex suffix.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
) \
{ \
dim_t n0; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
/*bli_init_auto()*/; \
\
/* Convert/typecast negative values of n to zero. */ \
bli_convert_blas_dim1( *n, n0 ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
/* Call BLIS interface. */ \
/* NOTE: While we skip explicit initialization for real domain instances
since we call the microkernel directly, the complex domain instances
still need initialization so that they can query valid contexts from
gks. However, the expert API will self-initialize before attempting
to query a context, so the complex domain cases should work fine. */ \
PASTEMAC2(ch,blisname,isuf) \
( \
BLIS_NO_CONJUGATE, \
n0, \
x0, incx0, \
y0, incy0, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
/*bli_finalize_auto();*/ \
}
#ifdef BLIS_ENABLE_BLAS
//INSERT_GENTFUNC_BLAS( copy, copyv )
GENTFUNC( scomplex, c, copy, copyv, _ex )
GENTFUNC( dcomplex, z, copy, copyv, _ex )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/amd/bla_gemv_amd.c 0000664 0000000 0000000 00000013066 14634250137 0023514 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// ?gemv wrapper.
//
// GENTFUNC generates the BLAS-style gemv entry point whose name is formed by
// PASTEF77(ch,blasname). The generated function proceeds as follows:
//   1. runs the BLAS parameter check selected by PASTEBLACHK(blasname);
//   2. returns immediately, without touching y, when *m or *n is zero
//      (deliberate netlib compatibility; see the long note in the body);
//   3. maps transa to a trans_t, maps negative m/n to zero, and derives the
//      lengths of y (m_y) and x (n_x) from the transposition;
//   4. adjusts the x and y pointers for negative increments;
//   5. if alpha is zero, scales y by beta through scalv and returns;
//   6. otherwise describes A with row stride 1 and column stride *lda, picks
//      gemv_unf_var2 for the no-transpose case or gemv_unf_var1 otherwise,
//      and invokes it with a context obtained from bli_gks_query_cntx().
// The bli_init_auto()/bli_finalize_auto() calls are commented out.
//
// When BLIS_ENABLE_BLAS is defined, the macro is instantiated through
// INSERT_GENTFUNC_BLAS( gemv, gemv ).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
) \
{ \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
/*bli_init_auto();*/ \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
\
/* BLAS handles cases where y has no elements as well as those where x has
no elements. In the case of the former, it cannot do any work since
the output vector is empty; but in the latter case, BLAS has peculiar
semantics. When x has no elements (and transa(A) has no columns), BLAS
returns immediately without performing any computation even if the
number of elements of y (and rows of transa(A)) is non-zero, in which
case any sane interpretation of gemv would have the operation
reduce to y := beta * y. Here, we emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be contemplated if it weren't for the fact
that some BLAS unit tests actually check for this behavior. Also, it
should be emphasized that BLIS, when called natively, does NOT exhibit
this quirky behavior; it will scale y by beta as one would expect. */ \
if ( *m == 0 || *n == 0 ) \
{ \
/* Finalize BLIS. */ \
/*bli_finalize_auto();*/ \
\
return; \
} \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
/* If alpha is zero, scale y by beta and return early. */ \
if ( PASTEMAC(ch,eq0)( *alpha ) ) \
{ \
PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
( \
BLIS_NO_CONJUGATE, \
m_y, \
( ftype* )beta, \
( ftype* )y0, incy0, \
NULL, \
NULL \
); \
return; \
} \
\
/* Set the row and column strides of A. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
\
/* Declare a void function pointer for the current operation. */ \
PASTECH2(ch,blisname,_unb_ft) f; \
\
/* Choose the underlying implementation. */ \
if ( bli_does_notrans( blis_transa ) ) f = PASTEMAC(ch,gemv_unf_var2); \
else /* if ( bli_does_trans( blis_transa ) ) */ f = PASTEMAC(ch,gemv_unf_var1); \
\
/* Obtain a valid context from the gks. This is needed because these
implementations of ?gemv_() skip calling gemv_ex() and instead
call the unblocked fused variants directly. */ \
cntx_t* cntx = bli_gks_query_cntx(); \
\
/* Invoke the variant chosen above, which loops over a level-1v or
level-1f kernel to implement the current operation. */ \
f \
( \
blis_transa, \
BLIS_NO_CONJUGATE, \
m0, \
n0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
x0, incx0, \
(ftype*)beta, \
y0, incy0, \
cntx \
); \
\
/* Finalize BLIS. */ \
/*bli_finalize_auto();*/ \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/ 0000775 0000000 0000000 00000000000 14634250137 0021310 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_gbmv.c 0000664 0000000 0000000 00000006311 14634250137 0023226 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// ?gbmv placeholder.
//
// GENTFUNC generates the BLAS-style gbmv entry point whose name is formed by
// PASTEF77(ch,blasname). The generated body only performs argument
// conversion -- it maps transa to a trans_t, maps negative m/n to zero,
// derives the lengths of x and y, and adjusts both vectors for negative
// increments -- and then passes BLIS_NOT_YET_IMPLEMENTED to
// bli_check_error_code(). No banded matrix-vector product is computed: the
// kl, ku, alpha, a, lda and beta arguments are never read, and the converted
// x0/y0/incx0/incy0 values are not used afterwards.
//
// When BLIS_ENABLE_BLAS is defined, the macro is instantiated through
// INSERT_GENTFUNC_BLAS( gbmv, gbmv ).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
f77_char* transa, \
f77_int* m, \
f77_int* n, \
f77_int* kl, \
f77_int* ku, \
ftype* alpha, \
ftype* a, f77_int* lda, \
ftype* x, f77_int* incx, \
ftype* beta, \
ftype* y, f77_int* incy \
) \
{ \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n_x, x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, y, *incy, y0, incy0 ); \
\
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gbmv, gbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_gbmv.h 0000664 0000000 0000000 00000004545 14634250137 0023242 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT generates the prototype of the BLAS-style gbmv entry point whose
// name is formed by PASTEF77(ch,blasname), marked with BLIS_EXPORT_BLAS. The
// parameter list is: transa, the dimensions m and n, the kl and ku
// arguments, alpha, the matrix a with lda, the vector x with incx, beta, and
// the vector y with incy -- all passed by pointer.
//
// When BLIS_ENABLE_BLAS is defined, the prototypes are emitted through
// INSERT_GENTPROT_BLAS( gbmv ).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* transa, \
f77_int* m, \
f77_int* n, \
f77_int* kl, \
f77_int* ku, \
ftype* alpha, \
ftype* a, f77_int* lda, \
ftype* x, f77_int* incx, \
ftype* beta, \
ftype* y, f77_int* incy \
);
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hbmv.c 0000664 0000000 0000000 00000005633 14634250137 0023235 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible hbmv entry points. Every expansion defines
// one function, named by PASTEF77(ch,blasname), that converts its uplo, m,
// incx and incy arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). The k, alpha, a, lda and beta arguments are accepted
// but never read.
// NOTE(review): INSERT_GENTFUNCCO_BLAS is defined elsewhere; presumably it
// expands GENTFUNCCO once per supported type -- confirm.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    f77_int*  k, \
    ftype*    alpha, \
    ftype*    a, f77_int* lda, \
    ftype*    x, f77_int* incx, \
    ftype*    beta, \
    ftype*    y, f77_int* incy \
) \
{ \
    uplo_t uplo_blis; \
    dim_t  m_nonneg; \
    ftype *x_adj, *y_adj; \
    inc_t  incx_adj, incy_adj; \
\
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* A negative m is converted to zero. */ \
    bli_convert_blas_dim1( *m, m_nonneg ); \
\
    /* For negative input increments, adjust the vector pointers so that */ \
    /* positive increments can be used instead. */ \
    bli_convert_blas_incv( m_nonneg, x, *incx, x_adj, incx_adj ); \
    bli_convert_blas_incv( m_nonneg, y, *incy, y_adj, incy_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hbmv, hbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hbmv.h 0000664 0000000 0000000 00000004440 14634250137 0023235 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported hbmv prototypes. Every expansion declares one
// function, named by PASTEF77(ch,blasname), in which all arguments are passed
// by pointer. The declarations are only emitted when BLIS_ENABLE_BLAS is
// defined.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    f77_int*  k, \
    ftype*    alpha, \
    ftype*    a, \
    f77_int*  lda, \
    ftype*    x, \
    f77_int*  incx, \
    ftype*    beta, \
    ftype*    y, \
    f77_int*  incy \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpmv.c 0000664 0000000 0000000 00000005542 14634250137 0023252 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible hpmv entry points. Every expansion defines
// one function, named by PASTEF77(ch,blasname), that converts its uplo, m,
// incx and incy arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). The alpha, a and beta arguments are accepted but
// never read.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype*    alpha, \
    ftype*    a, \
    ftype*    x, f77_int* incx, \
    ftype*    beta, \
    ftype*    y, f77_int* incy \
) \
{ \
    uplo_t uplo_blis; \
    dim_t  m_nonneg; \
    ftype *x_adj, *y_adj; \
    inc_t  incx_adj, incy_adj; \
\
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* A negative m is converted to zero. */ \
    bli_convert_blas_dim1( *m, m_nonneg ); \
\
    /* For negative input increments, adjust the vector pointers so that */ \
    /* positive increments can be used instead. */ \
    bli_convert_blas_incv( m_nonneg, x, *incx, x_adj, incx_adj ); \
    bli_convert_blas_incv( m_nonneg, y, *incy, y_adj, incy_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hpmv, hpmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpmv.h 0000664 0000000 0000000 00000004347 14634250137 0023261 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported hpmv prototypes. Every expansion declares one
// function, named by PASTEF77(ch,blasname), in which all arguments are passed
// by pointer. The declarations are only emitted when BLIS_ENABLE_BLAS is
// defined.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype*    alpha, \
    ftype*    a, \
    ftype*    x, \
    f77_int*  incx, \
    ftype*    beta, \
    ftype*    y, \
    f77_int*  incy \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hpmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpr.c 0000664 0000000 0000000 00000005242 14634250137 0023066 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible hpr entry points. Every expansion defines
// one function, named by PASTEF77(ch,blasname), that converts its uplo, m and
// incx arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). Note that alpha uses the ftype_r type rather than
// ftype; alpha and a are accepted but never read.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype_r*  alpha, \
    ftype*    x, f77_int* incx, \
    ftype*    a \
) \
{ \
    uplo_t uplo_blis; \
    dim_t  m_nonneg; \
    ftype* x_adj; \
    inc_t  incx_adj; \
\
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* A negative m is converted to zero. */ \
    bli_convert_blas_dim1( *m, m_nonneg ); \
\
    /* For a negative input increment, adjust the vector pointer so that */ \
    /* a positive increment can be used instead. */ \
    bli_convert_blas_incv( m_nonneg, x, *incx, x_adj, incx_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hpr, hpr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpr.h 0000664 0000000 0000000 00000004176 14634250137 0023100 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported hpr prototypes. Every expansion declares one
// function, named by PASTEF77(ch,blasname), in which all arguments are passed
// by pointer; alpha uses the ftype_r type. The declarations are only emitted
// when BLIS_ENABLE_BLAS is defined.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype_r*  alpha, \
    ftype*    x, \
    f77_int*  incx, \
    ftype*    a \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hpr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpr2.c 0000664 0000000 0000000 00000005464 14634250137 0023156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible hpr2 entry points. Every expansion defines
// one function, named by PASTEF77(ch,blasname), that converts its uplo, m,
// incx and incy arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). The alpha and a arguments are accepted but never
// read.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype*    alpha, \
    ftype*    x, f77_int* incx, \
    ftype*    y, f77_int* incy, \
    ftype*    a \
) \
{ \
    uplo_t uplo_blis; \
    dim_t  m_nonneg; \
    ftype *x_adj, *y_adj; \
    inc_t  incx_adj, incy_adj; \
\
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* A negative m is converted to zero. */ \
    bli_convert_blas_dim1( *m, m_nonneg ); \
\
    /* For negative input increments, adjust the vector pointers so that */ \
    /* positive increments can be used instead. */ \
    bli_convert_blas_incv( m_nonneg, x, *incx, x_adj, incx_adj ); \
    bli_convert_blas_incv( m_nonneg, y, *incy, y_adj, incy_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hpr2, hpr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_hpr2.h 0000664 0000000 0000000 00000004271 14634250137 0023156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported hpr2 prototypes. Every expansion declares one
// function, named by PASTEF77(ch,blasname), in which all arguments are passed
// by pointer. The declarations are only emitted when BLIS_ENABLE_BLAS is
// defined.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int*  m, \
    ftype*    alpha, \
    ftype*    x, \
    f77_int*  incx, \
    ftype*    y, \
    f77_int*  incy, \
    ftype*    a \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hpr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rot.c 0000664 0000000 0000000 00000005245 14634250137 0023104 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible rot entry points. Every expansion defines
// one function, named by PASTEF772(chxy,chr,blasname), that converts its n,
// incx and incy arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). The c and s arguments are accepted but never read.
// NOTE(review): ROT_KERNEL is forwarded as the blisname argument, which the
// template body does not use -- confirm it is defined wherever this expands.
#undef  GENTFUNCR2
#define GENTFUNCR2( ftype_xy, ftype_r, chxy, chr, blasname, blisname ) \
\
void PASTEF772(chxy,chr,blasname)( \
    f77_int*  n, \
    ftype_xy* x, f77_int* incx, \
    ftype_xy* y, f77_int* incy, \
    ftype_r*  c, \
    ftype_r*  s \
) \
{ \
    dim_t     n_nonneg; \
    ftype_xy *x_adj, *y_adj; \
    inc_t     incx_adj, incy_adj; \
\
    /* A negative n is converted to zero. */ \
    bli_convert_blas_dim1( *n, n_nonneg ); \
\
    /* For negative input increments, adjust the vector pointers so that */ \
    /* positive increments can be used instead. */ \
    bli_convert_blas_incv( n_nonneg, x, *incx, x_adj, incx_adj ); \
    bli_convert_blas_incv( n_nonneg, y, *incy, y_adj, incy_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCR2_BLAS( rot, ROT_KERNEL )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rot.h 0000664 0000000 0000000 00000004273 14634250137 0023111 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported rot prototypes. Every expansion declares one
// function, named by PASTEF772(chxy,chr,blasname), in which all arguments are
// passed by pointer; x and y use ftype_xy while c and s use ftype_r. The
// declarations are only emitted when BLIS_ENABLE_BLAS is defined.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_xy, ftype_r, chxy, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chxy,chr,blasname)( \
    f77_int*  n, \
    ftype_xy* x, \
    f77_int*  incx, \
    ftype_xy* y, \
    f77_int*  incy, \
    ftype_r*  c, \
    ftype_r*  s \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( rot )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotg.c 0000664 0000000 0000000 00000004247 14634250137 0023254 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible rotg entry points. Every expansion defines
// one function, named by PASTEF77(chxy,blasname), whose body does nothing but
// report BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(); none of the
// four pointer arguments is read.
// NOTE(review): INSERT_GENTFUNCR_BLAS is invoked with three arguments here;
// its definition is not in this file -- confirm the expected arity.
#undef  GENTFUNCR
#define GENTFUNCR( ftype_xy, ftype_r, chxy, chr, blasname, blisname ) \
\
void PASTEF77(chxy,blasname)( \
    ftype_xy* x, \
    ftype_xy* y, \
    ftype_r*  c, \
    ftype_r*  s \
) \
{ \
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCR_BLAS( rotg, rotg, ROTG_KERNEL )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotg.h 0000664 0000000 0000000 00000004125 14634250137 0023254 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported rotg prototypes. Every expansion declares one
// function, named by PASTEF77(chxy,blasname), taking four pointers: x and y
// of type ftype_xy, c and s of type ftype_r. The declarations are only
// emitted when BLIS_ENABLE_BLAS is defined.
// NOTE(review): INSERT_GENTPROTR_BLAS is invoked with two arguments here; its
// definition is not in this file -- confirm the expected arity.
#undef  GENTPROTR
#define GENTPROTR( ftype_xy, ftype_r, chxy, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(chxy,blasname)( \
    ftype_xy* x, \
    ftype_xy* y, \
    ftype_r*  c, \
    ftype_r*  s \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR_BLAS( rotg, rotg )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotm.c 0000664 0000000 0000000 00000005052 14634250137 0023255 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible rotm entry points. Every expansion defines
// one function, named by PASTEF77(ch,blasname), that converts its n, incx and
// incy arguments and then reports BLIS_NOT_YET_IMPLEMENTED through
// bli_check_error_code(). The dparam argument is accepted but never read.
// NOTE(review): ROTM_KERNEL is forwarded as the blisname argument, which the
// template body does not use -- confirm it is defined wherever this expands.
#undef  GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_int* n, \
    ftype*   x, f77_int* incx, \
    ftype*   y, f77_int* incy, \
    ftype*   dparam \
) \
{ \
    dim_t  n_nonneg; \
    ftype *x_adj, *y_adj; \
    inc_t  incx_adj, incy_adj; \
\
    /* A negative n is converted to zero. */ \
    bli_convert_blas_dim1( *n, n_nonneg ); \
\
    /* For negative input increments, adjust the vector pointers so that */ \
    /* positive increments can be used instead. */ \
    bli_convert_blas_incv( n_nonneg, x, *incx, x_adj, incx_adj ); \
    bli_convert_blas_incv( n_nonneg, y, *incy, y_adj, incy_adj ); \
\
    /* The operation itself has not been written yet. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( rotm, ROTM_KERNEL )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotm.h 0000664 0000000 0000000 00000004116 14634250137 0023262 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the exported rotm prototypes. Every expansion declares one
// function, named by PASTEF77(ch,blasname), in which all arguments are passed
// by pointer. The declarations are only emitted when BLIS_ENABLE_BLAS is
// defined.
#undef  GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
    f77_int* n, \
    ftype*   x, \
    f77_int* incx, \
    ftype*   y, \
    f77_int* incy, \
    ftype*   dparam \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( rotm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotmg.c 0000664 0000000 0000000 00000004253 14634250137 0023426 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO( ftype, ch, blasname, blisname ) defines the BLAS-style rotmg
// entry point whose name is produced by PASTEF77(ch,blasname). The function
// is a placeholder: none of its five pointer arguments (d1, d2, x, y, dparam)
// is read or written, and the blisname macro argument is not referenced. The
// only statement passes BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code().
// NOTE(review): bli_check_error_code() is defined elsewhere; presumably it
// reports the error and aborts -- confirm before relying on the symbol.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
ftype* d1, \
ftype* d2, \
ftype* x, \
ftype* y, \
ftype* dparam \
) \
{ \
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The rotmg definitions are emitted only when BLIS_ENABLE_BLAS is defined.
// ROTMG_KERNEL is passed as blisname, which the macro body above ignores.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( rotmg, ROTMG_KERNEL )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_rotmg.h 0000664 0000000 0000000 00000004130 14634250137 0023425 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style rotmg entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). It takes five ftype pointers:
// d1, d2, x, y and dparam.
// NOTE(review): PASTEF77 and INSERT_GENTPROTRO_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported real datatype -- confirm against their definitions.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
ftype* d1, \
ftype* d2, \
ftype* x, \
ftype* y, \
ftype* dparam \
);
// The rotmg prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( rotmg )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_sbmv.c 0000664 0000000 0000000 00000005615 14634250137 0023250 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO( ftype, ch, blasname, blisname ) defines the BLAS-style sbmv
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, m and the x/y increments into BLIS-side
// values and then passes BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code().
// k, alpha, a, lda, beta and the blisname macro argument are never
// referenced, and the translated values are not consumed afterwards.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int* m, \
    f77_int* k, \
    ftype* alpha, \
    ftype* a, f77_int* lda, \
    ftype* x, f77_int* incx, \
    ftype* beta, \
    ftype* y, f77_int* incy \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    ftype* y_head; \
    inc_t y_stride; \
    \
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x, then for y, so that */ \
    /* negative input increments are handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    bli_convert_blas_incv( m_len, y, *incy, y_head, y_stride ); \
    \
    /* No computation is performed: report that sbmv is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The sbmv definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( sbmv, sbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_sbmv.h 0000664 0000000 0000000 00000004422 14634250137 0023250 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style sbmv entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo character, the integers m, k, lda, incx and incy, and the
// ftype operands alpha, a, x, beta and y.
// NOTE(review): PASTEF77 and INSERT_GENTPROTRO_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported real datatype -- confirm against their definitions.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_int* m, \
f77_int* k, \
ftype* alpha, \
ftype* a, f77_int* lda, \
ftype* x, f77_int* incx, \
ftype* beta, \
ftype* y, f77_int* incy \
);
// The sbmv prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( sbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spmv.c 0000664 0000000 0000000 00000005524 14634250137 0023265 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO( ftype, ch, blasname, blisname ) defines the BLAS-style spmv
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, m and the x/y increments into BLIS-side
// values and then passes BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code().
// alpha, a, beta and the blisname macro argument are never referenced, and
// the translated values are not consumed afterwards.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int* m, \
    ftype* alpha, \
    ftype* a, \
    ftype* x, f77_int* incx, \
    ftype* beta, \
    ftype* y, f77_int* incy \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    ftype* y_head; \
    inc_t y_stride; \
    \
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x, then for y, so that */ \
    /* negative input increments are handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    bli_convert_blas_incv( m_len, y, *incy, y_head, y_stride ); \
    \
    /* No computation is performed: report that spmv is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The spmv definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( spmv, spmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spmv.h 0000664 0000000 0000000 00000004331 14634250137 0023265 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style spmv entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo character, the integers m, incx and incy, and the ftype
// operands alpha, a, x, beta and y. Unlike the sbmv prototype there is no
// lda argument accompanying a.
// NOTE(review): PASTEF77 and INSERT_GENTPROTRO_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported real datatype -- confirm against their definitions.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_int* m, \
ftype* alpha, \
ftype* a, \
ftype* x, f77_int* incx, \
ftype* beta, \
ftype* y, f77_int* incy \
);
// The spmv prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( spmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spr.c 0000664 0000000 0000000 00000005224 14634250137 0023101 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO( ftype, ch, blasname, blisname ) defines the BLAS-style spr
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, m and the x increment into BLIS-side
// values and then passes BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code().
// alpha, a and the blisname macro argument are never referenced, and the
// translated values are not consumed afterwards.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int* m, \
    ftype* alpha, \
    ftype* x, f77_int* incx, \
    ftype* a \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    \
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x so that a negative */ \
    /* input increment is handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    \
    /* No computation is performed: report that spr is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The spr definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( spr, spr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spr.h 0000664 0000000 0000000 00000004160 14634250137 0023104 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style spr entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo character, the integers m and incx, and the ftype
// operands alpha, x and a.
// NOTE(review): PASTEF77 and INSERT_GENTPROTRO_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported real datatype -- confirm against their definitions.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_int* m, \
ftype* alpha, \
ftype* x, f77_int* incx, \
ftype* a \
);
// The spr prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( spr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spr2.c 0000664 0000000 0000000 00000005446 14634250137 0023171 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO( ftype, ch, blasname, blisname ) defines the BLAS-style spr2
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, m and the x/y increments into BLIS-side
// values and then passes BLIS_NOT_YET_IMPLEMENTED to bli_check_error_code().
// alpha, a and the blisname macro argument are never referenced, and the
// translated values are not consumed afterwards.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_int* m, \
    ftype* alpha, \
    ftype* x, f77_int* incx, \
    ftype* y, f77_int* incy, \
    ftype* a \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    ftype* y_head; \
    inc_t y_stride; \
    \
    /* Translate the BLAS uplo character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x, then for y, so that */ \
    /* negative input increments are handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    bli_convert_blas_incv( m_len, y, *incy, y_head, y_stride ); \
    \
    /* No computation is performed: report that spr2 is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The spr2 definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( spr2, spr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_spr2.h 0000664 0000000 0000000 00000004253 14634250137 0023171 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style spr2 entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo character, the integers m, incx and incy, and the ftype
// operands alpha, x, y and a.
// NOTE(review): PASTEF77 and INSERT_GENTPROTRO_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported real datatype -- confirm against their definitions.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_int* m, \
ftype* alpha, \
ftype* x, f77_int* incx, \
ftype* y, f77_int* incy, \
ftype* a \
);
// The spr2 prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( spr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tbmv.c 0000664 0000000 0000000 00000005645 14634250137 0023254 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC( ftype, ch, blasname, blisname ) defines the BLAS-style tbmv
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, transa, diaga, m and the x increment
// into BLIS-side values and then passes BLIS_NOT_YET_IMPLEMENTED to
// bli_check_error_code(). k, a, lda and the blisname macro argument are never
// referenced, and the translated values are not consumed afterwards.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_char* transa, \
    f77_char* diaga, \
    f77_int* m, \
    f77_int* k, \
    ftype* a, f77_int* lda, \
    ftype* x, f77_int* incx \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    trans_t trans_val; \
    diag_t diag_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    \
    /* Translate each BLAS option character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_val ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x so that a negative */ \
    /* input increment is handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    \
    /* No computation is performed: report that tbmv is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The tbmv definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( tbmv, tbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tbmv.h 0000664 0000000 0000000 00000004324 14634250137 0023252 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style tbmv entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo/trans/diag option characters, the integers m, k, lda and
// incx, and the ftype operands a and x.
// NOTE(review): PASTEF77 and INSERT_GENTPROT_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported datatype -- confirm against their definitions.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_char* transa, \
f77_char* diaga, \
f77_int* m, \
f77_int* k, \
ftype* a, f77_int* lda, \
ftype* x, f77_int* incx \
);
// The tbmv prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( tbmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tbsv.c 0000664 0000000 0000000 00000005645 14634250137 0023262 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC( ftype, ch, blasname, blisname ) defines the BLAS-style tbsv
// entry point whose name is produced by PASTEF77(ch,blasname). The routine is
// a placeholder: it translates uploa, transa, diaga, m and the x increment
// into BLIS-side values and then passes BLIS_NOT_YET_IMPLEMENTED to
// bli_check_error_code(). k, a, lda and the blisname macro argument are never
// referenced, and the translated values are not consumed afterwards.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
    f77_char* uploa, \
    f77_char* transa, \
    f77_char* diaga, \
    f77_int* m, \
    f77_int* k, \
    ftype* a, f77_int* lda, \
    ftype* x, f77_int* incx \
    ) \
{ \
    /* BLIS-side counterparts of the caller's arguments. */ \
    uplo_t uplo_val; \
    trans_t trans_val; \
    diag_t diag_val; \
    dim_t m_len; \
    ftype* x_head; \
    inc_t x_stride; \
    \
    /* Translate each BLAS option character into its BLIS enumerated value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_val ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_val ); \
    \
    /* Convert m to dim_t, treating a negative value as zero. */ \
    bli_convert_blas_dim1( *m, m_len ); \
    \
    /* Derive the start pointer and increment for x so that a negative */ \
    /* input increment is handled. */ \
    bli_convert_blas_incv( m_len, x, *incx, x_head, x_stride ); \
    \
    /* No computation is performed: report that tbsv is unimplemented. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// The tbsv definitions are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( tbsv, tbsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tbsv.h 0000664 0000000 0000000 00000004324 14634250137 0023260 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT( ftype, ch, blasname ) expands to the declaration of one
// BLAS-style tbsv entry point: a void function tagged BLIS_EXPORT_BLAS whose
// name is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer: the uplo/trans/diag option characters, the integers m, k, lda and
// incx, and the ftype operands a and x.
// NOTE(review): PASTEF77 and INSERT_GENTPROT_BLAS are defined elsewhere;
// presumably they apply the Fortran-77 name mangling and instantiate the
// macro once per supported datatype -- confirm against their definitions.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_char* transa, \
f77_char* diaga, \
f77_int* m, \
f77_int* k, \
ftype* a, f77_int* lda, \
ftype* x, f77_int* incx \
);
// The tbsv prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( tbsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tpmv.c 0000664 0000000 0000000 00000005554 14634250137 0023271 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible tpmv entry point. This is a placeholder:
// after translating its arguments it unconditionally reports
// BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). The packed matrix
// argument a is never read.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
                            f77_char* uploa, \
                            f77_char* transa, \
                            f77_char* diaga, \
                            f77_int*  m, \
                            ftype*    a, \
                            ftype*    x, f77_int* incx \
                          ) \
{ \
    dim_t   dim_m; \
    inc_t   x_step; \
    ftype*  x_start; \
    uplo_t  uplo_val; \
    trans_t trans_val; \
    diag_t  diag_val; \
\
    /* Translate the three netlib character flags into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_val ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_val ); \
\
    /* A negative m is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *m, dim_m ); \
\
    /* Derive the BLIS-side start pointer and increment for x; this is */ \
    /* where negative BLAS increments are accounted for. */ \
    bli_convert_blas_incv( dim_m, x, *incx, x_start, x_step ); \
\
    /* No BLIS operation is invoked; signal that the routine is a stub. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// Instantiate the stub only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( tpmv, tpmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tpmv.h 0000664 0000000 0000000 00000004233 14634250137 0023267 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the
// BLAS-compatible tpmv entry point. The function name is produced by
// PASTEF77(ch,blasname); the function returns void and receives every
// argument by pointer. The parameter list mirrors the GENTFUNC definition
// in bla_tpmv.c.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_char* transa, \
f77_char* diaga, \
f77_int* m, \
ftype* a, \
ftype* x, f77_int* incx \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROT_BLAS is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( tpmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tpsv.c 0000664 0000000 0000000 00000005554 14634250137 0023277 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible tpsv entry point. This is a placeholder:
// after translating its arguments it unconditionally reports
// BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). The packed matrix
// argument a is never read.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
                            f77_char* uploa, \
                            f77_char* transa, \
                            f77_char* diaga, \
                            f77_int*  m, \
                            ftype*    a, \
                            ftype*    x, f77_int* incx \
                          ) \
{ \
    dim_t   dim_m; \
    inc_t   x_step; \
    ftype*  x_start; \
    uplo_t  uplo_val; \
    trans_t trans_val; \
    diag_t  diag_val; \
\
    /* Translate the three netlib character flags into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_val ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_val ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_val ); \
\
    /* A negative m is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *m, dim_m ); \
\
    /* Derive the BLIS-side start pointer and increment for x; this is */ \
    /* where negative BLAS increments are accounted for. */ \
    bli_convert_blas_incv( dim_m, x, *incx, x_start, x_step ); \
\
    /* No BLIS operation is invoked; signal that the routine is a stub. */ \
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}
// Instantiate the stub only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( tpsv, tpsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/attic/bla_tpsv.h 0000664 0000000 0000000 00000004233 14634250137 0023275 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the
// BLAS-compatible tpsv entry point. The function name is produced by
// PASTEF77(ch,blasname); the function returns void and receives every
// argument by pointer. The parameter list mirrors the GENTFUNC definition
// in bla_tpsv.c.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
f77_char* uploa, \
f77_char* transa, \
f77_char* diaga, \
f77_int* m, \
ftype* a, \
ftype* x, f77_int* incx \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROT_BLAS is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( tpsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_amax.c 0000664 0000000 0000000 00000006137 14634250137 0022123 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible i?amax entry points. The generated
// function returns 0 when *n < 1 or *incx <= 0; otherwise it forwards to the
// BLIS amaxv routine and returns that routine's zero-based index plus one.
#undef GENTFUNC
#define GENTFUNC( ftype_x, chx, blasname, blisname ) \
\
f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
    gint_t   zero_based; \
    f77_int  one_based; \
    dim_t    len; \
    ftype_x* x_start; \
    inc_t    x_step; \
\
    /* Emulate netlib BLAS for empty vectors and non-positive increments */ \
    /* by answering 0 right away. Skipping this guard would let the 0 */ \
    /* produced by bli_?amaxv() be bumped to 1 by the index conversion */ \
    /* further down, which is not the desired result. */ \
    if ( !( *n >= 1 && *incx > 0 ) ) return 0; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointer and increment for x. The cast */ \
    /* drops the const qualifier that the BLAS signature carries. */ \
    bli_convert_blas_incv( len, (ftype_x*)x, *incx, x_start, x_step ); \
\
    /* Invoke the BLIS routine; it writes its result to zero_based. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      len, \
      x_start, x_step, \
      &zero_based, \
      NULL, \
      NULL \
    ); \
\
    /* Shift from the zero-based BLIS (C) index to the one-based BLAS */ \
    /* (Fortran) index. When the BLAS and BLIS integer sizes differ, */ \
    /* the conversion between them happens in this assignment. */ \
    one_based = zero_based + 1; \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
\
    return one_based; \
}
// Instantiate the functions only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( amax, amaxv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_amax.h 0000664 0000000 0000000 00000003705 14634250137 0022126 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the
// BLAS-compatible i?amax entry points. The function name is produced by
// PASTEF772(i,chx,blasname); the function returns an f77_int and takes the
// length, the vector and its increment through const pointers. The
// parameter list mirrors the GENTFUNC definition in bla_amax.c.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
( \
const f77_int* n, \
const ftype_x* x, const f77_int* incx \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROT_BLAS is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_asum.c 0000664 0000000 0000000 00000005116 14634250137 0022136 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible ?asum entry points. The generated
// function forwards to the BLIS asumv routine and returns the ftype_r value
// that routine stores. ftype_x is the element type of x and ftype_r the
// type of the result.
#undef GENTFUNCR2
#define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
\
ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
    ftype_r  abs_sum; \
    dim_t    len; \
    ftype_x* x_start; \
    inc_t    x_step; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointer and increment for x. The cast */ \
    /* drops the const qualifier that the BLAS signature carries. */ \
    bli_convert_blas_incv( len, (ftype_x*)x, *incx, x_start, x_step ); \
\
    /* Invoke the BLIS routine; it writes its result to abs_sum. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      len, \
      x_start, x_step, \
      &abs_sum, \
      NULL, \
      NULL \
    ); \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
\
    return abs_sum; \
}
// Instantiate the functions only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCR2_BLAS( asum, asumv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_asum.h 0000664 0000000 0000000 00000003733 14634250137 0022146 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTR2 is (re)defined here as the prototype template for the
// BLAS-compatible ?asum entry points. The function name is produced by
// PASTEF772(chr,chx,blasname); the function returns an ftype_r and takes the
// length, the vector (of ftype_x elements) and its increment through const
// pointers. The parameter list mirrors the GENTFUNCR2 definition in
// bla_asum.c.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
( \
const f77_int* n, \
const ftype_x* x, const f77_int* incx \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROTR2_BLAS is defined elsewhere; presumably it expands
// GENTPROTR2 once per supported datatype pairing -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_axpy.c 0000664 0000000 0000000 00000005334 14634250137 0022154 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible ?axpy entry points. The generated
// function translates the BLAS length and increments and forwards alpha, x
// and y to the BLIS axpyv routine, requesting no conjugation.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     ) \
{ \
    dim_t  len; \
    ftype* x_start; \
    inc_t  x_step; \
    ftype* y_start; \
    inc_t  y_step; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointers and increments for x and y; */ \
    /* this is where negative BLAS increments are accounted for. The */ \
    /* cast on x drops the const qualifier of the BLAS signature. */ \
    bli_convert_blas_incv( len, (ftype*)x, *incx, x_start, x_step ); \
    bli_convert_blas_incv( len, (ftype*)y, *incy, y_start, y_step ); \
\
    /* Invoke the BLIS routine. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      BLIS_NO_CONJUGATE, \
      len, \
      (ftype*)alpha, \
      x_start, x_step, \
      y_start, y_step, \
      NULL, \
      NULL \
    ); \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
}
// Instantiate the functions only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( axpy, axpyv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_axpy.h 0000664 0000000 0000000 00000004011 14634250137 0022150 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the
// BLAS-compatible ?axpy entry points. The function name is produced by
// PASTEF77(ch,blasname); the function returns void. Only y is reachable
// through a non-const pointer; n, alpha, x and both increments are const.
// The parameter list mirrors the GENTFUNC definition in bla_axpy.c.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* alpha, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROT_BLAS is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_copy.c 0000664 0000000 0000000 00000005251 14634250137 0022143 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-compatible ?copy entry points. The generated
// function translates the BLAS length and increments and forwards x and y
// to the BLIS copyv routine, requesting no conjugation.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     ) \
{ \
    dim_t  len; \
    ftype* x_start; \
    inc_t  x_step; \
    ftype* y_start; \
    inc_t  y_step; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointers and increments for x and y; */ \
    /* this is where negative BLAS increments are accounted for. The */ \
    /* cast on x drops the const qualifier of the BLAS signature. */ \
    bli_convert_blas_incv( len, (ftype*)x, *incx, x_start, x_step ); \
    bli_convert_blas_incv( len, (ftype*)y, *incy, y_start, y_step ); \
\
    /* Invoke the BLIS routine. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      BLIS_NO_CONJUGATE, \
      len, \
      x_start, x_step, \
      y_start, y_step, \
      NULL, \
      NULL \
    ); \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
}
// Instantiate the functions only when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( copy, copyv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_copy.h 0000664 0000000 0000000 00000003752 14634250137 0022154 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined here as the prototype template for the
// BLAS-compatible ?copy entry points. The function name is produced by
// PASTEF77(ch,blasname); the function returns void. Only y is reachable
// through a non-const pointer; n, x and both increments are const.
// The parameter list mirrors the GENTFUNC definition in bla_copy.c.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
);
// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
// INSERT_GENTPROT_BLAS is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- TODO confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( copy )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_dot.c 0000664 0000000 0000000 00000013174 14634250137 0021762 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
#ifdef BLIS_ENABLE_BLAS
//
// Define BLAS-to-BLIS interfaces.
//
// Value-returning template for the BLAS-compatible dot product entry
// points. The generated function forwards x and y to the BLIS dotv routine,
// applying blis_conjx to x and no conjugation to y, and returns the computed
// value directly.
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
ftype PASTEF772(ch,blasname,chc) \
     ( \
       const f77_int* n, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   y, const f77_int* incy \
     ) \
{ \
    ftype  dot_val; \
    dim_t  len; \
    ftype* x_start; \
    inc_t  x_step; \
    ftype* y_start; \
    inc_t  y_step; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointers and increments for x and y; */ \
    /* this is where negative BLAS increments are accounted for. The */ \
    /* casts drop the const qualifiers of the BLAS signature. */ \
    bli_convert_blas_incv( len, (ftype*)x, *incx, x_start, x_step ); \
    bli_convert_blas_incv( len, (ftype*)y, *incy, y_start, y_step ); \
\
    /* Invoke the BLIS routine; it writes its result to dot_val. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_conjx, \
      BLIS_NO_CONJUGATE, \
      len, \
      x_start, x_step, \
      y_start, y_step, \
      &dot_val, \
      NULL, \
      NULL \
    ); \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
\
    return dot_val; \
}
INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#else // #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL
// For the "intel" complex return type, use a hidden preceding parameter to
// return the result rather than an actual return value.
// Hidden-result-parameter variant of the dot product template: the
// generated function returns void and stores the computed value through its
// leading rhop argument instead of returning it.
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
void PASTEF772(ch,blasname,chc) \
     ( \
       ftype*         rhop, \
       const f77_int* n, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   y, const f77_int* incy \
     ) \
{ \
    ftype  dot_val; \
    dim_t  len; \
    ftype* x_start; \
    inc_t  x_step; \
    ftype* y_start; \
    inc_t  y_step; \
\
    /* Bring up BLIS before touching any of its interfaces. */ \
    bli_init_auto(); \
\
    /* A negative n is converted/typecast to zero. */ \
    bli_convert_blas_dim1( *n, len ); \
\
    /* Derive the BLIS-side start pointers and increments for x and y; */ \
    /* this is where negative BLAS increments are accounted for. The */ \
    /* casts drop the const qualifiers of the BLAS signature. */ \
    bli_convert_blas_incv( len, (ftype*)x, *incx, x_start, x_step ); \
    bli_convert_blas_incv( len, (ftype*)y, *incy, y_start, y_step ); \
\
    /* Invoke the BLIS routine; it writes its result to the local */ \
    /* dot_val rather than straight through rhop. */ \
    /* NOTE(review): the two trailing NULLs are presumably the optional */ \
    /* context/runtime arguments of the expert interface -- confirm. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_conjx, \
      BLIS_NO_CONJUGATE, \
      len, \
      x_start, x_step, \
      y_start, y_step, \
      &dot_val, \
      NULL, \
      NULL \
    ); \
\
    /* Tear BLIS down again. */ \
    bli_finalize_auto(); \
\
    /* Hand the result back through the hidden leading parameter. */ \
    *rhop = dot_val; \
}
INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
#endif
// -- "Black sheep" dot product function definitions --
// Input vectors stored in single precision, computed in double precision,
// with result returned in single precision.
// Adds the scalar *sb to the double-precision dot product produced by
// PASTEF77(d,sdot) and rounds the sum to single precision.
float PASTEF77(sd,sdot)
     (
       const f77_int* n,
       const float*   sb,
       const float*   x, const f77_int* incx,
       const float*   y, const f77_int* incy
     )
{
    // Promote the additive term so that the sum is formed in double precision.
    const double offset   = ( double )(*sb);
    const double dot_prod = PASTEF77(d,sdot)( n, x, incx, y, incy );

    return ( float )( offset + dot_prod );
}
// Input vectors stored in single precision, computed in double precision,
// with result returned in double precision.
double PASTEF77(d,sdot)
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
)
{
dim_t n0;
float* x0;
float* y0;
inc_t incx0;
inc_t incy0;
double rho;
dim_t i;
/* Initialization of BLIS is not required. */
/* Convert/typecast negative values of n to zero. */
bli_convert_blas_dim1( *n, n0 );
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */
bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );
rho = 0.0;
for ( i = 0; i < n0; i++ )
{
float* chi1 = x0 + (i )*incx0;
float* psi1 = y0 + (i )*incy0;
bli_ddots( (( double )(*chi1)),
(( double )(*psi1)), rho );
}
/* Finalization of BLIS is not required, because initialization was
not required. */
return rho;
}
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_dot.h 0000664 0000000 0000000 00000005702 14634250137 0021765 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef BLIS_ENABLE_BLAS
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for dot-product wrappers that return their result by
// value (return type ftype). The symbol name is built by PASTEF772 from ch,
// blasname and chc; the parameter list mirrors the definitions in bla_dot.c.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
\
BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
( \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
);
INSERT_GENTPROTDOTR_BLAS( dot )
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
INSERT_GENTPROTDOTC_BLAS( dot )
#else
// For the "intel" complex return type, we use a hidden parameter (passed by
// address) to return the result.
// Prototype template for the hidden-parameter variant: the function returns
// void and the result is delivered through the leading rhop pointer. Only
// used when BLIS_DISABLE_COMPLEX_RETURN_INTEL is not defined.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
( \
ftype* rhop, \
const f77_int* n, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy \
);
INSERT_GENTPROTDOTC_BLAS( dot )
#endif
// -- "Black sheep" dot product function prototypes --
// Mixed-precision variant taking float vectors plus an extra float scalar
// (sb) and returning float.
BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
(
const f77_int* n,
const float* sb,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
);
// Mixed-precision variant taking float vectors and returning double.
BLIS_EXPORT_BLAS double PASTEF77(d,sdot)
(
const f77_int* n,
const float* x, const f77_int* incx,
const float* y, const f77_int* incy
);
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_gemm.c 0000664 0000000 0000000 00000015677 14634250137 0022133 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019-2022, Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// gemm wrapper template selected when BLIS_BLAS3_CALLS_TAPI is defined: the
// BLAS arguments are checked, translated to BLIS types, and forwarded to the
// typed expert interface.
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     ) \
{ \
    trans_t op_a, op_b; \
    dim_t   mm, nn, kk; \
\
    /* Bring BLIS up before touching any of its APIs. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      transa, transb, \
      m, n, k, \
      lda, ldb, ldc \
    ); \
\
    /* Translate the transposition characters into BLIS trans_t values. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &op_a ); \
    bli_param_map_netlib_to_blis_trans( *transb, &op_b ); \
\
    /* Translate the BLAS dimensions into BLIS dimensions. */ \
    bli_convert_blas_dim1( *m, mm ); \
    bli_convert_blas_dim1( *n, nn ); \
    bli_convert_blas_dim1( *k, kk ); \
\
    /* Unit row stride; the leading dimension is the column stride. */ \
    const inc_t rs_a = 1, cs_a = *lda; \
    const inc_t rs_b = 1, cs_b = *ldb; \
    const inc_t rs_c = 1, cs_c = *ldc; \
\
    /* Dispatch to the typed expert interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      op_a, op_b, \
      mm, nn, kk, \
      (ftype*)alpha, \
      (ftype*)a, rs_a, cs_a, \
      (ftype*)b, rs_b, cs_b, \
      (ftype*)beta, \
      (ftype*)c, rs_c, cs_c, \
      NULL, NULL \
    ); \
\
    /* Matching teardown for bli_init_auto(). */ \
    bli_finalize_auto(); \
}
#else
// gemm wrapper template selected when BLIS_BLAS3_CALLS_TAPI is not defined.
// After checking and translating the BLAS arguments it either
//   - forwards to the typed gemv interface when n == 1 or m == 1, or
//   - wraps the operands in obj_t objects and calls the object-based expert
//     interface.
// Every exit path pairs bli_init_auto() with bli_finalize_auto().
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     ) \
{ \
    trans_t blis_transa; \
    trans_t blis_transb; \
    dim_t   m0, n0, k0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      transa, \
      transb, \
      m, \
      n, \
      k, \
      lda, \
      ldb, \
      ldc \
    ); \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
    bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \
\
    /* Typecast BLAS integers to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m0 ); \
    bli_convert_blas_dim1( *n, n0 ); \
    bli_convert_blas_dim1( *k, k0 ); \
\
    /* Set the row and column strides of the matrix operands. */ \
    const inc_t rs_a = 1; \
    const inc_t cs_a = *lda; \
    const inc_t rs_b = 1; \
    const inc_t cs_b = *ldb; \
    const inc_t rs_c = 1; \
    const inc_t cs_c = *ldc; \
\
    /* Handle special cases of m == 1 or n == 1 via gemv. */ \
    if ( n0 == 1 ) \
    { \
        /* C and op(B) are single columns: c := beta*c + alpha*op(A)*b.
           The vector b is walked along rows or columns of B depending on
           whether B is transposed. */ \
        dim_t m0t, k0t; \
        bli_set_dims_with_trans( blis_transa, m0, k0, &m0t, &k0t ); \
\
        PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
        ( \
          blis_transa, \
          bli_extract_conj( blis_transb ), \
          m0t, k0t, \
          ( ftype* )alpha, \
          ( ftype* )a, rs_a, cs_a, \
          ( ftype* )b, ( bli_does_notrans( blis_transb ) ? rs_b : cs_b ), \
          ( ftype* )beta, \
          c, rs_c, \
          NULL, \
          NULL \
        ); \
\
        /* Finalize BLIS on this early exit too, as the general path does. */ \
        bli_finalize_auto(); \
\
        return; \
    } \
    else if ( m0 == 1 ) \
    { \
        /* C and op(A) are single rows: the product is evaluated as a gemv
           on B with its row and column strides swapped, writing into the
           single row of C (stride cs_c). */ \
        dim_t n0t, k0t; \
        bli_set_dims_with_trans( blis_transb, n0, k0, &n0t, &k0t ); \
\
        PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \
        ( \
          blis_transb, \
          bli_extract_conj( blis_transa ), \
          n0t, k0t, \
          ( ftype* )alpha, \
          ( ftype* )b, cs_b, rs_b, \
          ( ftype* )a, ( bli_does_notrans( blis_transa ) ? cs_a : rs_a ), \
          ( ftype* )beta, \
          c, cs_c, \
          NULL, \
          NULL \
        ); \
\
        /* Finalize BLIS on this early exit too, as the general path does. */ \
        bli_finalize_auto(); \
\
        return; \
    } \
\
    /* General case: describe each operand with an obj_t and use the
       object-based interface. */ \
    const num_t dt = PASTEMAC(ch,type); \
\
    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t ao     = BLIS_OBJECT_INITIALIZER; \
    obj_t bo     = BLIS_OBJECT_INITIALIZER; \
    obj_t betao  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t co     = BLIS_OBJECT_INITIALIZER; \
\
    dim_t m0_a, n0_a; \
    dim_t m0_b, n0_b; \
\
    /* Dimensions of A and B as stored, i.e. before op() is applied. */ \
    bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \
    bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \
\
    bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
    bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
\
    bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \
    bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
    bli_obj_init_finish( dt, m0,   n0,   (ftype*)c, rs_c, cs_c, &co ); \
\
    bli_obj_set_conjtrans( blis_transa, &ao ); \
    bli_obj_set_conjtrans( blis_transb, &bo ); \
\
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      &alphao, \
      &ao, \
      &bo, \
      &betao, \
      &co, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemm, gemm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_gemm.h 0000664 0000000 0000000 00000004325 14634250137 0022124 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the gemm wrappers. The symbol name is built by
// PASTEF77 from ch and blasname; every argument is passed by pointer and only
// c is non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_char* transb, \
const f77_int* m, \
const f77_int* n, \
const f77_int* k, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
);
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_gemv.c 0000664 0000000 0000000 00000011264 14634250137 0022130 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2022, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// gemv wrapper template: checks the BLAS arguments, translates them to BLIS
// types, and forwards to the typed expert interface. It returns early,
// without computing anything, when y is non-empty but x is empty (see the
// long comment in the body).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
) \
{ \
trans_t blis_transa; \
dim_t m0, n0; \
dim_t m_y, n_x; \
ftype* x0; \
ftype* y0; \
inc_t incx0; \
inc_t incy0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
transa, \
m, \
n, \
lda, \
incx, \
incy \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
/* Convert/typecast negative values of m and n to zero. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Determine the dimensions of x and y so we can adjust the increments,
if necessary.*/ \
bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
/* BLAS handles cases where y has no elements as well as those where x has
no elements. In the case of the former, it cannot do any work since
the output vector is empty; but in the latter case, BLAS has peculiar
semantics. When x has no elements (and transa(A) has no columns), BLAS
returns immediately without performing any computation even if the
number of elements of y (and rows of transa(A)) is non-zero, in which
case any sane interpretations of gemv would have the operation
reduce to y := beta * y. Here, we emulate the BLAS exactly so as to
provide "bug-for-bug" compatibility. Note that this extreme level of
compatibility would not be contemplated if it weren't for the fact
that some BLAS unit tests actually check for this behavior. Also, it
should be emphasized that BLIS, when called natively, does NOT exhibit
this quirky behavior; it will scale y by beta as one would expect. */ \
if ( m_y > 0 && n_x == 0 ) \
{ \
/* Finalize BLIS. */ \
bli_finalize_auto(); \
\
return; \
} \
\
/* If the input increments are negative, adjust the pointers so we can
use positive increments instead. */ \
bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
/* Set the row and column strides of A. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
\
/* Call BLIS interface. */ \
PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
( \
blis_transa, \
BLIS_NO_CONJUGATE, \
m0, \
n0, \
(ftype*)alpha, \
(ftype*)a, rs_a, cs_a, \
x0, incx0, \
(ftype*)beta, \
y0, incy0, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_gemv.h 0000664 0000000 0000000 00000004232 14634250137 0022132 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the gemv wrappers. The symbol name is built by
// PASTEF77 from ch and blasname; every argument is passed by pointer and only
// y is non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* transa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
);
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_ger.c 0000664 0000000 0000000 00000006310 14634250137 0021743 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// ger wrapper template: checks the BLAS arguments, translates them to BLIS
// types, and forwards to the typed expert interface. x is never conjugated;
// the conjugation applied to y is fixed per instantiation through blis_conjy.
#undef  GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjy, blasname, blisname ) \
\
void PASTEF772(ch,blasname,chc) \
     ( \
       const f77_int* m, \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   y, const f77_int* incy, \
             ftype*   a, const f77_int* lda \
     ) \
{ \
    dim_t  rows, cols; \
    ftype* xp; \
    ftype* yp; \
    inc_t  xinc; \
    inc_t  yinc; \
\
    /* Bring BLIS up before touching any of its APIs. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), MKSTR(chc), \
      m, n, \
      incx, incy, \
      lda \
    ); \
\
    /* Translate the BLAS dimensions into BLIS dimensions. */ \
    bli_convert_blas_dim1( *m, rows ); \
    bli_convert_blas_dim1( *n, cols ); \
\
    /* Translate each (pointer, increment) pair into its BLIS form; x has
       one element per row of A and y one per column. */ \
    bli_convert_blas_incv( rows, (ftype*)x, *incx, xp, xinc ); \
    bli_convert_blas_incv( cols, (ftype*)y, *incy, yp, yinc ); \
\
    /* Unit row stride; the leading dimension is the column stride. */ \
    const inc_t rs_a = 1, cs_a = *lda; \
\
    /* Dispatch to the typed expert interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      BLIS_NO_CONJUGATE, blis_conjy, \
      rows, cols, \
      (ftype*)alpha, \
      xp, xinc, \
      yp, yinc, \
      (ftype*)a, rs_a, cs_a, \
      NULL, NULL \
    ); \
\
    /* Matching teardown for bli_init_auto(). */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCDOT_BLAS( ger, ger )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_ger.h 0000664 0000000 0000000 00000004152 14634250137 0021752 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the ger wrappers. The symbol name is built by
// PASTEF772 from chxy, blasname and chc; every argument is passed by pointer
// and only a is non-const.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, chxy, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
( \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* x, const f77_int* incx, \
const ftype* y, const f77_int* incy, \
ftype* a, const f77_int* lda \
);
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTDOT_BLAS( ger )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_hemm.c 0000664 0000000 0000000 00000014076 14634250137 0022124 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// hemm wrapper template selected when BLIS_BLAS3_CALLS_TAPI is defined: the
// BLAS arguments are checked, translated to BLIS types, and forwarded to the
// typed expert interface with no conjugation of A and no transposition of B.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     ) \
{ \
    side_t which_side; \
    uplo_t which_tri; \
    dim_t  rows, cols; \
\
    /* Bring BLIS up before touching any of its APIs. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, \
      m, n, \
      lda, ldb, ldc \
    ); \
\
    /* Translate the side/uplo characters into BLIS enumerated values. */ \
    bli_param_map_netlib_to_blis_side( *side,  &which_side ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &which_tri  ); \
\
    /* Translate the BLAS dimensions into BLIS dimensions. */ \
    bli_convert_blas_dim1( *m, rows ); \
    bli_convert_blas_dim1( *n, cols ); \
\
    /* Unit row stride; the leading dimension is the column stride. */ \
    const inc_t rs_a = 1, cs_a = *lda; \
    const inc_t rs_b = 1, cs_b = *ldb; \
    const inc_t rs_c = 1, cs_c = *ldc; \
\
    /* Dispatch to the typed expert interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      which_side, which_tri, \
      BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, \
      rows, cols, \
      (ftype*)alpha, \
      (ftype*)a, rs_a, cs_a, \
      (ftype*)b, rs_b, cs_b, \
      (ftype*)beta, \
      (ftype*)c, rs_c, cs_c, \
      NULL, NULL \
    ); \
\
    /* Matching teardown for bli_init_auto(). */ \
    bli_finalize_auto(); \
}
#else
// hemm wrapper template selected when BLIS_BLAS3_CALLS_TAPI is not defined:
// the BLAS arguments are checked and translated, each operand is wrapped in
// an obj_t (A marked Hermitian with the requested uplo), and the object-based
// expert interface is called.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     ) \
{ \
    side_t which_side; \
    uplo_t which_tri; \
    dim_t  rows, cols; \
\
    /* Bring BLIS up before touching any of its APIs. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, \
      m, n, \
      lda, ldb, ldc \
    ); \
\
    /* Translate the side/uplo characters into BLIS enumerated values. */ \
    bli_param_map_netlib_to_blis_side( *side,  &which_side ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &which_tri  ); \
\
    /* Translate the BLAS dimensions into BLIS dimensions. */ \
    bli_convert_blas_dim1( *m, rows ); \
    bli_convert_blas_dim1( *n, cols ); \
\
    /* Unit row stride; the leading dimension is the column stride. */ \
    const inc_t rs_a = 1, cs_a = *lda; \
    const inc_t rs_b = 1, cs_b = *ldb; \
    const inc_t rs_c = 1, cs_c = *ldc; \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t betao  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t ao     = BLIS_OBJECT_INITIALIZER; \
    obj_t bo     = BLIS_OBJECT_INITIALIZER; \
    obj_t co     = BLIS_OBJECT_INITIALIZER; \
\
    /* A is square; its order is selected from (rows, cols) by the side. */ \
    dim_t dim_a; \
    bli_set_dim_with_side( which_side, rows, cols, &dim_a ); \
\
    /* B is never transposed, so it is sized from (rows, cols) directly. */ \
    dim_t rows_b, cols_b; \
    bli_set_dims_with_trans( BLIS_NO_TRANSPOSE, rows, cols, &rows_b, &cols_b ); \
\
    /* Attach the caller's buffers to the scalar and matrix objects. */ \
    bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
    bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
\
    bli_obj_init_finish( dt, dim_a,  dim_a,  (ftype*)a, rs_a, cs_a, &ao ); \
    bli_obj_init_finish( dt, rows_b, cols_b, (ftype*)b, rs_b, cs_b, &bo ); \
    bli_obj_init_finish( dt, rows,   cols,   (ftype*)c, rs_c, cs_c, &co ); \
\
    /* Record uplo, conjugation and structure on A, and the (absent)
       transposition on B. */ \
    bli_obj_set_uplo( which_tri, &ao ); \
    bli_obj_set_conj( BLIS_NO_CONJUGATE, &ao ); \
    bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &bo ); \
\
    bli_obj_set_struc( BLIS_HERMITIAN, &ao ); \
\
    /* Dispatch to the object-based expert interface. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      which_side, \
      &alphao, \
      &ao, \
      &bo, \
      &betao, \
      &co, \
      NULL, NULL \
    ); \
\
    /* Matching teardown for bli_init_auto(). */ \
    bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hemm, hemm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_hemm.h 0000664 0000000 0000000 00000004312 14634250137 0022121 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the BLAS-style ?hemm entry point. The symbol name
// is produced by PASTEF77(ch,blasname) and exported via BLIS_EXPORT_BLAS.
// ftype_r and chr are part of the template signature but are not referenced
// by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     );

// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_hemv.c 0000664 0000000 0000000 00000006560 14634250137 0022134 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-style ?hemv entry point. Each expansion defines a
// function named PASTEF77(ch,blasname) that:
//   1. initializes BLIS and runs the BLAS parameter check,
//   2. converts the BLAS arguments (uplo character, dimension, increments)
//      into their BLIS-typed counterparts,
//   3. forwards to the typed BLIS routine selected by
//      PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), and
//   4. finalizes BLIS.
// ftype_r and chr are part of the template signature but unused here.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    beta, \
       ftype*          y, const f77_int* incy \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t a_uplo; \
	dim_t  m_dim; \
	ftype* x_ptr; \
	inc_t  x_inc; \
	ftype* y_ptr; \
	inc_t  y_inc; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploa, m, lda, incx, incy ); \
\
	/* Translate the uplo character into the BLIS enumeration. */ \
	bli_param_map_netlib_to_blis_uplo( *uploa, &a_uplo ); \
\
	/* A negative m is converted to zero. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
\
	/* When an increment is negative, the corresponding base pointer is */ \
	/* adjusted so the vector can be addressed accordingly. */ \
	bli_convert_blas_incv( m_dim, (ftype*)x, *incx, x_ptr, x_inc ); \
	bli_convert_blas_incv( m_dim, (ftype*)y, *incy, y_ptr, y_inc ); \
\
	/* A is described with a unit row stride and a column stride of lda. */ \
	const inc_t a_rs = 1; \
	const inc_t a_cs = *lda; \
\
	/* Forward to the typed BLIS routine. Both conjugation arguments are */ \
	/* BLIS_NO_CONJUGATE. NOTE(review): the two trailing NULLs are */ \
	/* presumably the optional context/runtime arguments -- confirm. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  a_uplo, \
	  BLIS_NO_CONJUGATE, \
	  BLIS_NO_CONJUGATE, \
	  m_dim, \
	  (ftype*)alpha, \
	  (ftype*)a, a_rs, a_cs, \
	  x_ptr, x_inc, \
	  (ftype*)beta, \
	  y_ptr, y_inc, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}

// The wrappers are only generated when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hemv, hemv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_hemv.h 0000664 0000000 0000000 00000004221 14634250137 0022131 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the BLAS-style ?hemv entry point. The symbol name
// is produced by PASTEF77(ch,blasname) and exported via BLIS_EXPORT_BLAS.
// ftype_r and chr are part of the template signature but are not referenced
// by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    x, \
       const f77_int*  incx, \
       const ftype*    beta, \
       ftype*          y, \
       const f77_int*  incy \
     );

// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her.c 0000664 0000000 0000000 00000006174 14634250137 0021754 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-style ?her entry point. Each expansion defines a
// function named PASTEF77(ch,blasname) that initializes BLIS, validates the
// BLAS arguments, converts them to BLIS types, forwards to the typed BLIS
// routine selected by PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), and finalizes
// BLIS. Note that alpha has type ftype_r here, while x and a use ftype.
// chr is part of the template signature but unused.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype_r*  alpha, \
       const ftype*    x, const f77_int* incx, \
       ftype*          a, const f77_int* lda \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t a_uplo; \
	dim_t  m_dim; \
	ftype* x_ptr; \
	inc_t  x_inc; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploa, m, incx, lda ); \
\
	/* Translate the uplo character into the BLIS enumeration. */ \
	bli_param_map_netlib_to_blis_uplo( *uploa, &a_uplo ); \
\
	/* A negative m is converted to zero. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
\
	/* When incx is negative, the base pointer of x is adjusted so the */ \
	/* vector can be addressed accordingly. */ \
	bli_convert_blas_incv( m_dim, (ftype*)x, *incx, x_ptr, x_inc ); \
\
	/* A is described with a unit row stride and a column stride of lda. */ \
	const inc_t a_rs = 1; \
	const inc_t a_cs = *lda; \
\
	/* Forward to the typed BLIS routine with BLIS_NO_CONJUGATE for x. */ \
	/* NOTE(review): the two trailing NULLs are presumably the optional */ \
	/* context/runtime arguments -- confirm. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  a_uplo, \
	  BLIS_NO_CONJUGATE, \
	  m_dim, \
	  (ftype_r*)alpha, \
	  x_ptr, x_inc, \
	  (ftype*)a, a_rs, a_cs, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}

// The wrappers are only generated when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( her, her )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her.h 0000664 0000000 0000000 00000004100 14634250137 0021744 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the BLAS-style ?her entry point. The symbol name
// is produced by PASTEF77(ch,blasname) and exported via BLIS_EXPORT_BLAS.
// alpha is declared with ftype_r; x and a use ftype. chr is part of the
// template signature but is not referenced by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype_r*  alpha, \
       const ftype*    x, \
       const f77_int*  incx, \
       ftype*          a, \
       const f77_int*  lda \
     );

// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her2.c 0000664 0000000 0000000 00000006476 14634250137 0022043 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// Template for the BLAS-style ?her2 entry point. Each expansion defines a
// function named PASTEF77(ch,blasname) that initializes BLIS, validates the
// BLAS arguments, converts them to BLIS types, forwards to the typed BLIS
// routine selected by PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), and finalizes
// BLIS. ftype_r and chr are part of the template signature but unused here.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    y, const f77_int* incy, \
       ftype*          a, const f77_int* lda \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t a_uplo; \
	dim_t  m_dim; \
	ftype* x_ptr; \
	inc_t  x_inc; \
	ftype* y_ptr; \
	inc_t  y_inc; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploa, m, incx, incy, lda ); \
\
	/* Translate the uplo character into the BLIS enumeration. */ \
	bli_param_map_netlib_to_blis_uplo( *uploa, &a_uplo ); \
\
	/* A negative m is converted to zero. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
\
	/* When an increment is negative, the corresponding base pointer is */ \
	/* adjusted so the vector can be addressed accordingly. */ \
	bli_convert_blas_incv( m_dim, (ftype*)x, *incx, x_ptr, x_inc ); \
	bli_convert_blas_incv( m_dim, (ftype*)y, *incy, y_ptr, y_inc ); \
\
	/* A is described with a unit row stride and a column stride of lda. */ \
	const inc_t a_rs = 1; \
	const inc_t a_cs = *lda; \
\
	/* Forward to the typed BLIS routine. Both conjugation arguments are */ \
	/* BLIS_NO_CONJUGATE. NOTE(review): the two trailing NULLs are */ \
	/* presumably the optional context/runtime arguments -- confirm. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  a_uplo, \
	  BLIS_NO_CONJUGATE, \
	  BLIS_NO_CONJUGATE, \
	  m_dim, \
	  (ftype*)alpha, \
	  x_ptr, x_inc, \
	  y_ptr, y_inc, \
	  (ftype*)a, a_rs, a_cs, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}

// The wrappers are only generated when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( her2, her2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her2.h 0000664 0000000 0000000 00000004162 14634250137 0022036 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the BLAS-style ?her2 entry point. The symbol name
// is produced by PASTEF77(ch,blasname) and exported via BLIS_EXPORT_BLAS.
// ftype_r and chr are part of the template signature but are not referenced
// by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    x, \
       const f77_int*  incx, \
       const ftype*    y, \
       const f77_int*  incy, \
       ftype*          a, \
       const f77_int*  lda \
     );

// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her2k.c 0000664 0000000 0000000 00000016003 14634250137 0022201 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// Template for the BLAS-style ?her2k entry point, typed-API variant (this
// definition is the one compiled when BLIS_BLAS3_CALLS_TAPI is defined).
// Each expansion defines PASTEF77(ch,blasname), which validates and converts
// the BLAS arguments, returns early when there is nothing to compute, and
// otherwise forwards to PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF).
// alpha uses ftype while beta uses ftype_r.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype_r*  beta, \
       ftype*          c, const f77_int* ldc \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t  c_uplo; \
	trans_t ab_trans; \
	dim_t   m_dim; \
	dim_t   k_dim; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploc, transa, m, k, lda, ldb, ldc ); \
\
	/* Translate the option characters into BLIS enumerations. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &c_uplo ); \
	bli_param_map_netlib_to_blis_trans( *transa, &ab_trans ); \
\
	/* Convert the BLAS integer dimensions to BLIS dimensions. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
	bli_convert_blas_dim1( *k, k_dim ); \
\
	/* BLAS early return, case 1: matrix C is empty. */ \
	if ( m_dim == 0 ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* BLAS early return, case 2: the rank-2k product is empty (alpha is */ \
	/* zero or k is zero) and beta is one, so C would be left as is. */ \
	if ( ( PASTEMAC(ch,eq0)( *alpha ) || k_dim == 0 ) && \
	     PASTEMAC(chr,eq1)( *beta ) ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* Every matrix operand gets a unit row stride and a column stride */ \
	/* equal to its leading-dimension argument. */ \
	const inc_t a_rs = 1, a_cs = *lda; \
	const inc_t b_rs = 1, b_cs = *ldb; \
	const inc_t c_rs = 1, c_cs = *ldc; \
\
	/* Forward to the typed BLIS routine; the same trans value is passed */ \
	/* for both A and B. NOTE(review): the two trailing NULLs are */ \
	/* presumably the optional context/runtime arguments -- confirm. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  c_uplo, \
	  ab_trans, \
	  ab_trans, \
	  m_dim, \
	  k_dim, \
	  (ftype*)alpha, \
	  (ftype*)a, a_rs, a_cs, \
	  (ftype*)b, b_rs, b_cs, \
	  (ftype_r*)beta, \
	  (ftype*)c, c_rs, c_cs, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}
#else
// Template for the BLAS-style ?her2k entry point, object-API variant (this
// definition is the one compiled when BLIS_BLAS3_CALLS_TAPI is not defined).
// Each expansion defines PASTEF77(ch,blasname), which validates and converts
// the BLAS arguments, returns early when there is nothing to compute, wraps
// the operands in obj_t objects, and calls
// PASTEMAC(blisname,BLIS_OAPI_EX_SUF). alpha uses ftype; beta uses ftype_r.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype_r*  beta, \
       ftype*          c, const f77_int* ldc \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t  c_uplo; \
	trans_t ab_trans; \
	dim_t   m_dim; \
	dim_t   k_dim; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploc, transa, m, k, lda, ldb, ldc ); \
\
	/* Translate the option characters into BLIS enumerations. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &c_uplo ); \
	bli_param_map_netlib_to_blis_trans( *transa, &ab_trans ); \
\
	/* Convert the BLAS integer dimensions to BLIS dimensions. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
	bli_convert_blas_dim1( *k, k_dim ); \
\
	/* BLAS early return, case 1: matrix C is empty. */ \
	if ( m_dim == 0 ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* BLAS early return, case 2: the rank-2k product is empty (alpha is */ \
	/* zero or k is zero) and beta is one, so C would be left as is. */ \
	if ( ( PASTEMAC(ch,eq0)( *alpha ) || k_dim == 0 ) && \
	     PASTEMAC(chr,eq1)( *beta ) ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* Every matrix operand gets a unit row stride and a column stride */ \
	/* equal to its leading-dimension argument. */ \
	const inc_t a_rs = 1, a_cs = *lda; \
	const inc_t b_rs = 1, b_cs = *ldb; \
	const inc_t c_rs = 1, c_cs = *ldc; \
\
	/* Datatype ids for the ch and chr type characters. */ \
	const num_t dt_ch  = PASTEMAC(ch,type); \
	const num_t dt_chr = PASTEMAC(chr,type); \
\
	/* Object wrappers for the scalars and matrices. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t a_obj     = BLIS_OBJECT_INITIALIZER; \
	obj_t b_obj     = BLIS_OBJECT_INITIALIZER; \
	obj_t c_obj     = BLIS_OBJECT_INITIALIZER; \
\
	/* Dimensions of A and B as stored; both are derived from m, k and */ \
	/* the same trans value. */ \
	dim_t a_rows, a_cols; \
	dim_t b_rows, b_cols; \
\
	bli_set_dims_with_trans( ab_trans, m_dim, k_dim, &a_rows, &a_cols ); \
	bli_set_dims_with_trans( ab_trans, m_dim, k_dim, &b_rows, &b_cols ); \
\
	/* Scalars: alpha carries the ch datatype, beta the chr datatype. */ \
	bli_obj_init_finish_1x1( dt_ch,  (ftype*  )alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt_chr, (ftype_r*)beta,  &beta_obj ); \
\
	/* Matrix A, tagged with the requested trans value. */ \
	bli_obj_init_finish( dt_ch, a_rows, a_cols, (ftype*)a, a_rs, a_cs, &a_obj ); \
	bli_obj_set_conjtrans( ab_trans, &a_obj ); \
\
	/* Matrix B, tagged with the same trans value as A. */ \
	bli_obj_init_finish( dt_ch, b_rows, b_cols, (ftype*)b, b_rs, b_cs, &b_obj ); \
	bli_obj_set_conjtrans( ab_trans, &b_obj ); \
\
	/* Matrix C is m-by-m, with the requested uplo and marked Hermitian. */ \
	bli_obj_init_finish( dt_ch, m_dim, m_dim, (ftype*)c, c_rs, c_cs, &c_obj ); \
	bli_obj_set_uplo( c_uplo, &c_obj ); \
	bli_obj_set_struc( BLIS_HERMITIAN, &c_obj ); \
\
	/* Forward to the object-based BLIS routine. NOTE(review): the two */ \
	/* trailing NULLs are presumably the optional context/runtime */ \
	/* arguments -- confirm. */ \
	PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
	( \
	  &alpha_obj, \
	  &a_obj, \
	  &b_obj, \
	  &beta_obj, \
	  &c_obj, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}
#endif
// Expand whichever GENTFUNCCO variant was selected above for her2k. The
// wrappers are only generated when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( her2k, her2k )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_her2k.h 0000664 0000000 0000000 00000004315 14634250137 0022211 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// Prototype template for the BLAS-style ?her2k entry point. The symbol name
// is produced by PASTEF77(ch,blasname) and exported via BLIS_EXPORT_BLAS.
// alpha is declared with ftype and beta with ftype_r. chr is part of the
// template signature but is not referenced by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype_r*  beta, \
       ftype*          c, \
       const f77_int*  ldc \
     );

// The prototypes are only emitted when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_herk.c 0000664 0000000 0000000 00000014712 14634250137 0022124 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// Template for the BLAS-style ?herk entry point, typed-API variant (this
// definition is the one compiled when BLIS_BLAS3_CALLS_TAPI is defined).
// Each expansion defines PASTEF77(ch,blasname), which validates and converts
// the BLAS arguments, returns early when there is nothing to compute, and
// otherwise forwards to PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF).
// Both alpha and beta use ftype_r; a and c use ftype.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype_r*  alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype_r*  beta, \
       ftype*          c, const f77_int* ldc \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t  c_uplo; \
	trans_t a_trans; \
	dim_t   m_dim; \
	dim_t   k_dim; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploc, transa, m, k, lda, ldc ); \
\
	/* Translate the option characters into BLIS enumerations. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &c_uplo ); \
	bli_param_map_netlib_to_blis_trans( *transa, &a_trans ); \
\
	/* Convert the BLAS integer dimensions to BLIS dimensions. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
	bli_convert_blas_dim1( *k, k_dim ); \
\
	/* BLAS early return, case 1: matrix C is empty. */ \
	if ( m_dim == 0 ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* BLAS early return, case 2: the rank-k product is empty (alpha is */ \
	/* zero or k is zero) and beta is one, so C would be left as is. */ \
	if ( ( PASTEMAC(chr,eq0)( *alpha ) || k_dim == 0 ) && \
	     PASTEMAC(chr,eq1)( *beta ) ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* Both matrix operands get a unit row stride and a column stride */ \
	/* equal to their leading-dimension argument. */ \
	const inc_t a_rs = 1, a_cs = *lda; \
	const inc_t c_rs = 1, c_cs = *ldc; \
\
	/* Forward to the typed BLIS routine. NOTE(review): the two trailing */ \
	/* NULLs are presumably the optional context/runtime arguments -- */ \
	/* confirm. */ \
	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
	( \
	  c_uplo, \
	  a_trans, \
	  m_dim, \
	  k_dim, \
	  (ftype_r*)alpha, \
	  (ftype*)a, a_rs, a_cs, \
	  (ftype_r*)beta, \
	  (ftype*)c, c_rs, c_cs, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}
#else
// Template for the BLAS-style ?herk entry point, object-API variant (this
// definition is the one compiled when BLIS_BLAS3_CALLS_TAPI is not defined).
// Each expansion defines PASTEF77(ch,blasname), which validates and converts
// the BLAS arguments, returns early when there is nothing to compute, wraps
// the operands in obj_t objects, and calls
// PASTEMAC(blisname,BLIS_OAPI_EX_SUF). Both alpha and beta use ftype_r.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype_r*  alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype_r*  beta, \
       ftype*          c, const f77_int* ldc \
     ) \
{ \
	/* BLIS-typed counterparts of the incoming BLAS arguments. */ \
	uplo_t  c_uplo; \
	trans_t a_trans; \
	dim_t   m_dim; \
	dim_t   k_dim; \
\
	/* Bring up BLIS before doing anything else. */ \
	bli_init_auto(); \
\
	/* Run the BLAS-level argument validation for this operation. */ \
	PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploc, transa, m, k, lda, ldc ); \
\
	/* Translate the option characters into BLIS enumerations. */ \
	bli_param_map_netlib_to_blis_uplo( *uploc, &c_uplo ); \
	bli_param_map_netlib_to_blis_trans( *transa, &a_trans ); \
\
	/* Convert the BLAS integer dimensions to BLIS dimensions. */ \
	bli_convert_blas_dim1( *m, m_dim ); \
	bli_convert_blas_dim1( *k, k_dim ); \
\
	/* BLAS early return, case 1: matrix C is empty. */ \
	if ( m_dim == 0 ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* BLAS early return, case 2: the rank-k product is empty (alpha is */ \
	/* zero or k is zero) and beta is one, so C would be left as is. */ \
	if ( ( PASTEMAC(chr,eq0)( *alpha ) || k_dim == 0 ) && \
	     PASTEMAC(chr,eq1)( *beta ) ) \
	{ \
		bli_finalize_auto(); \
		return; \
	} \
\
	/* Both matrix operands get a unit row stride and a column stride */ \
	/* equal to their leading-dimension argument. */ \
	const inc_t a_rs = 1, a_cs = *lda; \
	const inc_t c_rs = 1, c_cs = *ldc; \
\
	/* Datatype ids for the ch and chr type characters. */ \
	const num_t dt_ch  = PASTEMAC(ch,type); \
	const num_t dt_chr = PASTEMAC(chr,type); \
\
	/* Object wrappers for the scalars and matrices. */ \
	obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
	obj_t a_obj     = BLIS_OBJECT_INITIALIZER; \
	obj_t c_obj     = BLIS_OBJECT_INITIALIZER; \
\
	/* Dimensions of A as stored, derived from m, k and the trans value. */ \
	dim_t a_rows, a_cols; \
\
	bli_set_dims_with_trans( a_trans, m_dim, k_dim, &a_rows, &a_cols ); \
\
	/* Scalars: alpha and beta both carry the chr datatype. */ \
	bli_obj_init_finish_1x1( dt_chr, (ftype_r*)alpha, &alpha_obj ); \
	bli_obj_init_finish_1x1( dt_chr, (ftype_r*)beta,  &beta_obj ); \
\
	/* Matrix A, tagged with the requested trans value. */ \
	bli_obj_init_finish( dt_ch, a_rows, a_cols, (ftype*)a, a_rs, a_cs, &a_obj ); \
	bli_obj_set_conjtrans( a_trans, &a_obj ); \
\
	/* Matrix C is m-by-m, with the requested uplo and marked Hermitian. */ \
	bli_obj_init_finish( dt_ch, m_dim, m_dim, (ftype*)c, c_rs, c_cs, &c_obj ); \
	bli_obj_set_uplo( c_uplo, &c_obj ); \
	bli_obj_set_struc( BLIS_HERMITIAN, &c_obj ); \
\
	/* Forward to the object-based BLIS routine. NOTE(review): the two */ \
	/* trailing NULLs are presumably the optional context/runtime */ \
	/* arguments -- confirm. */ \
	PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
	( \
	  &alpha_obj, \
	  &a_obj, \
	  &beta_obj, \
	  &c_obj, \
	  NULL, \
	  NULL \
	); \
\
	/* Tear BLIS down again. */ \
	bli_finalize_auto(); \
}
#endif
// Expand whichever GENTFUNCCO variant was selected above for herk. The
// wrappers are only generated when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( herk, herk )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_herk.h 0000664 0000000 0000000 00000004234 14634250137 0022127 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO: template for the prototype of the BLAS-compatible herk
// routine. The function name is built by PASTEF77(ch,blasname) and the
// declaration is tagged with BLIS_EXPORT_BLAS.
//   ftype    - element type of the matrix arguments a and c
//   ftype_r  - type of the scalar arguments alpha and beta
//   ch       - type character pasted into the function name
//   chr      - accepted by the macro but not used in this prototype
//   blasname - operation name pasted into the function name
// Every argument is passed by pointer; c is the only non-const argument.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* uploc, \
const f77_char* transa, \
const f77_int* m, \
const f77_int* k, \
const ftype_r* alpha, \
const ftype* a, const f77_int* lda, \
const ftype_r* beta, \
ftype* c, const f77_int* ldc \
);
// The herk prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
// NOTE(review): INSERT_GENTPROTCO_BLAS is defined elsewhere; presumably it
// expands GENTPROTCO once per supported datatype -- confirm.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_nrm2.c 0000664 0000000 0000000 00000005117 14634250137 0022050 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCR2: template for the BLAS-compatible nrm2 wrapper. The vector
// element type (ftype_x) and the returned type (ftype_r) are separate macro
// parameters. The wrapper converts the BLAS-style length and increment,
// calls the typed BLIS routine PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF), and
// returns the value that routine writes through its output pointer.
#undef GENTFUNCR2
#define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
\
ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
    ftype_r  result; \
    ftype_x* x_first; \
    inc_t    x_stride; \
    dim_t    n_elem; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Convert n to dim_t; a negative n becomes zero. */ \
    bli_convert_blas_dim1( *n, n_elem ); \
\
    /* Derive the start pointer and increment passed on to BLIS from */ \
    /* the BLAS (x, incx) pair; negative increments are handled here. */ \
    bli_convert_blas_incv( n_elem, (ftype_x*)x, *incx, x_first, x_stride ); \
\
    /* Hand off to the typed BLIS routine; the two trailing arguments */ \
    /* are left NULL. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF)( n_elem, x_first, x_stride, &result, NULL, NULL ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
\
    return result; \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCR2_BLAS( nrm2, normfv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_nrm2.h 0000664 0000000 0000000 00000003733 14634250137 0022057 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTR2: template for the prototype of the BLAS-compatible nrm2
// routine. The function returns ftype_r and takes a vector of ftype_x; its
// name is built by PASTEF772(chr,chx,blasname) and the declaration is tagged
// with BLIS_EXPORT_BLAS. Both n and incx are passed by pointer, and x is
// const.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
( \
const f77_int* n, \
const ftype_x* x, const f77_int* incx \
);
// The nrm2 prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_scal.c 0000664 0000000 0000000 00000005704 14634250137 0022116 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCSCAL: template for the BLAS-compatible scal wrapper. The vector
// element type (ftype_x) and the scalar type (ftype_a) are separate macro
// parameters and may differ. alpha is first copied into a temporary of the
// vector's type so that the single-type BLIS routine
// PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) can be called.
#undef GENTFUNCSCAL
#define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
\
void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     ) \
{ \
    ftype_x  alpha_x; \
    ftype_x* x_first; \
    inc_t    x_stride; \
    dim_t    n_elem; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Convert n to dim_t; a negative n becomes zero. */ \
    bli_convert_blas_dim1( *n, n_elem ); \
\
    /* Derive the start pointer and increment passed on to BLIS from */ \
    /* the BLAS (x, incx) pair; negative increments are handled here. */ \
    bli_convert_blas_incv( n_elem, (ftype_x*)x, *incx, x_first, x_stride ); \
\
    /* NOTE: BLAS's csscal/zdscal are not implemented natively in BLIS. */ \
    /* Those cases are handled sub-optimally: alpha is cast to ftype_x */ \
    /* (potentially the complex domain) and the homogeneous-datatype */ \
    /* routine for that type is used. */ \
    PASTEMAC2(cha,chx,copys)( *alpha, alpha_x ); \
\
    /* Hand off to the typed BLIS routine; the two trailing arguments */ \
    /* are left NULL. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF)( BLIS_NO_CONJUGATE, n_elem, &alpha_x, x_first, x_stride, NULL, NULL ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_scal.h 0000664 0000000 0000000 00000003767 14634250137 0022132 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTSCAL: template for the prototype of the BLAS-compatible scal
// routine. alpha has type ftype_a and the vector x has type ftype_x; the
// function name is built by PASTEF772(chx,cha,blasname) and the declaration
// is tagged with BLIS_EXPORT_BLAS. x is the only non-const argument.
// NOTE(review): this macro takes (ftype_a, ftype_x, cha, chx), whereas
// GENTFUNCSCAL in bla_scal.c takes (ftype_x, ftype_a, chx, cha). The
// INSERT_GENTPROTSCAL_BLAS / INSERT_GENTFUNCSCAL_BLAS macros (defined
// elsewhere) must each pass arguments in the matching order -- confirm.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
( \
const f77_int* n, \
const ftype_a* alpha, \
ftype_x* x, const f77_int* incx \
);
// The scal prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_swap.c 0000664 0000000 0000000 00000005205 14634250137 0022142 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC (swap): template for the BLAS-compatible swap wrapper. Both
// vectors have element type ftype and are non-const. The wrapper converts
// the BLAS-style length and increments and calls the typed BLIS routine
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF).
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     ) \
{ \
    inc_t  x_stride; \
    inc_t  y_stride; \
    ftype* x_first; \
    ftype* y_first; \
    dim_t  n_elem; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Convert n to dim_t; a negative n becomes zero. */ \
    bli_convert_blas_dim1( *n, n_elem ); \
\
    /* Derive, for each vector, the start pointer and increment passed */ \
    /* on to BLIS; negative BLAS increments are handled here. */ \
    bli_convert_blas_incv( n_elem, (ftype*)x, *incx, x_first, x_stride ); \
    bli_convert_blas_incv( n_elem, (ftype*)y, *incy, y_first, y_stride ); \
\
    /* Hand off to the typed BLIS routine; the two trailing arguments */ \
    /* are left NULL. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF)( n_elem, x_first, x_stride, y_first, y_stride, NULL, NULL ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( swap, swapv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_swap.h 0000664 0000000 0000000 00000003736 14634250137 0022156 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (swap): template for the prototype of the BLAS-compatible swap
// routine. The function name is built by PASTEF77(ch,blasname) and the
// declaration is tagged with BLIS_EXPORT_BLAS. Both vectors x and y are
// non-const pointers to ftype; n, incx and incy are passed by const pointer.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_int* n, \
ftype* x, const f77_int* incx, \
ftype* y, const f77_int* incy \
);
// The swap prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_symm.c 0000664 0000000 0000000 00000014030 14634250137 0022151 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// GENTFUNC (symm, typed-API variant): used when BLIS_BLAS3_CALLS_TAPI is
// defined. The wrapper runs the BLAS parameter check, maps the side/uplo
// characters to BLIS enums, converts the dimensions, and calls the typed
// BLIS routine PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF). All three matrices
// are passed with a row stride of 1 and a column stride equal to their
// leading dimension.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     ) \
{ \
    dim_t  m_dim, n_dim; \
    uplo_t uplo_blis; \
    side_t side_blis; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), side, uploa, m, n, lda, ldb, ldc ); \
\
    /* Translate the BLAS character flags into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_side( *side, &side_blis ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* Convert the BLAS integer dimensions to BLIS dim_t values. */ \
    bli_convert_blas_dim1( *m, m_dim ); \
    bli_convert_blas_dim1( *n, n_dim ); \
\
    /* Row stride is 1 and column stride is the leading dimension. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
    const inc_t b_rs = 1; \
    const inc_t b_cs = *ldb; \
    const inc_t c_rs = 1; \
    const inc_t c_cs = *ldc; \
\
    /* Hand off to the typed BLIS routine; a is passed without */ \
    /* conjugation, b without transposition, and the two trailing */ \
    /* arguments are left NULL. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      side_blis, uplo_blis, \
      BLIS_NO_CONJUGATE, BLIS_NO_TRANSPOSE, \
      m_dim, n_dim, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      (ftype*)b, b_rs, b_cs, \
      (ftype*)beta, \
      (ftype*)c, c_rs, c_cs, \
      NULL, NULL \
    ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
}
#else
// GENTFUNC (symm, object-API variant): used when BLIS_BLAS3_CALLS_TAPI is
// not defined. It performs the same parameter check, side/uplo mapping and
// dimension conversion as the typed variant, then wraps alpha, a, b, beta
// and c in obj_t objects and calls PASTEMAC(blisname,BLIS_OAPI_EX_SUF).
//   - alpha and beta are wrapped as 1x1 objects of datatype dt.
//   - a is created as mn0_a x mn0_a, where mn0_a is produced by
//     bli_set_dim_with_side() from blis_side, m0 and n0 (presumably m0 or
//     n0 depending on the side -- confirm). It receives the caller's uplo,
//     BLIS_NO_CONJUGATE, and structure BLIS_SYMMETRIC.
//   - b is created as m0_b x n0_b, with dimensions taken from
//     bli_set_dims_with_trans() using BLIS_NO_TRANSPOSE.
//   - c is created as m0 x n0.
// All matrices use a row stride of 1 and a column stride equal to their
// leading dimension. The object attributes are set after
// bli_obj_init_finish(); keep that ordering when modifying this macro.
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
( \
const f77_char* side, \
const f77_char* uploa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
) \
{ \
side_t blis_side; \
uplo_t blis_uploa; \
dim_t m0, n0; \
\
/* Initialize BLIS. */ \
bli_init_auto(); \
\
/* Perform BLAS parameter checking. */ \
PASTEBLACHK(blasname) \
( \
MKSTR(ch), \
MKSTR(blasname), \
side, \
uploa, \
m, \
n, \
lda, \
ldb, \
ldc \
); \
\
/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
bli_param_map_netlib_to_blis_side( *side, &blis_side ); \
bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \
\
/* Typecast BLAS integers to BLIS integers. */ \
bli_convert_blas_dim1( *m, m0 ); \
bli_convert_blas_dim1( *n, n0 ); \
\
/* Set the row and column strides of the matrix operands. */ \
const inc_t rs_a = 1; \
const inc_t cs_a = *lda; \
const inc_t rs_b = 1; \
const inc_t cs_b = *ldb; \
const inc_t rs_c = 1; \
const inc_t cs_c = *ldc; \
\
const num_t dt = PASTEMAC(ch,type); \
\
const conj_t conja = BLIS_NO_CONJUGATE; \
const trans_t transb = BLIS_NO_TRANSPOSE; \
const struc_t struca = BLIS_SYMMETRIC; \
\
obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t ao = BLIS_OBJECT_INITIALIZER; \
obj_t bo = BLIS_OBJECT_INITIALIZER; \
obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \
obj_t co = BLIS_OBJECT_INITIALIZER; \
\
dim_t mn0_a; \
dim_t m0_b, n0_b; \
\
bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \
bli_set_dims_with_trans( transb, m0, n0, &m0_b, &n0_b ); \
\
bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \
\
bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \
bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \
bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \
\
bli_obj_set_uplo( blis_uploa, &ao ); \
bli_obj_set_conj( conja, &ao ); \
bli_obj_set_conjtrans( transb, &bo ); \
\
bli_obj_set_struc( struca, &ao ); \
\
PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
( \
blis_side, \
&alphao, \
&ao, \
&bo, \
&betao, \
&co, \
NULL, \
NULL \
); \
\
/* Finalize BLIS. */ \
bli_finalize_auto(); \
}
#endif
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( symm, symm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_symm.h 0000664 0000000 0000000 00000004266 14634250137 0022170 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (symm): template for the prototype of the BLAS-compatible symm
// routine. The function name is built by PASTEF77(ch,blasname) and the
// declaration is tagged with BLIS_EXPORT_BLAS. All scalars and matrices
// share the single element type ftype. Every argument is passed by pointer,
// and c is the only non-const argument.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* side, \
const f77_char* uploa, \
const f77_int* m, \
const f77_int* n, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* b, const f77_int* ldb, \
const ftype* beta, \
ftype* c, const f77_int* ldc \
);
// The symm prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_symv.c 0000664 0000000 0000000 00000006542 14634250137 0022173 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO (symv): template for the BLAS-compatible symv wrapper. The
// wrapper runs the BLAS parameter check, maps the uplo character to a BLIS
// enum, converts the dimension and the vector increments, and calls the
// typed BLIS routine PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF). The matrix a
// is passed with a row stride of 1 and a column stride of *lda.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     ) \
{ \
    inc_t  x_stride; \
    inc_t  y_stride; \
    ftype* x_first; \
    ftype* y_first; \
    dim_t  m_dim; \
    uplo_t uplo_blis; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploa, m, lda, incx, incy ); \
\
    /* Translate the BLAS uplo character into its BLIS enum value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* Convert m to dim_t; a negative m becomes zero. */ \
    bli_convert_blas_dim1( *m, m_dim ); \
\
    /* Derive, for each vector, the start pointer and increment passed */ \
    /* on to BLIS; negative BLAS increments are handled here. */ \
    bli_convert_blas_incv( m_dim, (ftype*)x, *incx, x_first, x_stride ); \
    bli_convert_blas_incv( m_dim, (ftype*)y, *incy, y_first, y_stride ); \
\
    /* Row stride of A is 1 and column stride is the leading dimension. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
\
    /* Hand off to the typed BLIS routine; both conjugation arguments */ \
    /* are BLIS_NO_CONJUGATE and the two trailing arguments are NULL. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, \
      m_dim, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      x_first, x_stride, \
      (ftype*)beta, \
      y_first, y_stride, \
      NULL, NULL \
    ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( symv, symv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_symv.h 0000664 0000000 0000000 00000004203 14634250137 0022170 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTRO (symv): template for the prototype of the BLAS-compatible symv
// routine. The function name is built by PASTEF77(ch,blasname) and the
// declaration is tagged with BLIS_EXPORT_BLAS. All scalars, the matrix a and
// the vectors x and y share the single element type ftype. Every argument is
// passed by pointer, and y is the only non-const argument.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
const f77_char* uploa, \
const f77_int* m, \
const ftype* alpha, \
const ftype* a, const f77_int* lda, \
const ftype* x, const f77_int* incx, \
const ftype* beta, \
ftype* y, const f77_int* incy \
);
// The symv prototypes are emitted only when BLIS_ENABLE_BLAS is defined.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( symv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr.c 0000664 0000000 0000000 00000006154 14634250137 0022011 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCRO (syr): template for the BLAS-compatible syr wrapper. The
// wrapper runs the BLAS parameter check, maps the uplo character to a BLIS
// enum, converts the dimension and the vector increment, and calls the
// typed BLIS routine PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF). The matrix a
// (the only non-const argument) is passed with a row stride of 1 and a
// column stride of *lda.
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* a, const f77_int* lda \
     ) \
{ \
    inc_t  x_stride; \
    ftype* x_first; \
    dim_t  m_dim; \
    uplo_t uplo_blis; \
\
    /* Auto-initialize BLIS before calling into it. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style argument check for this operation. */ \
    PASTEBLACHK(blasname)( MKSTR(ch), MKSTR(blasname), uploa, m, incx, lda ); \
\
    /* Translate the BLAS uplo character into its BLIS enum value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
\
    /* Convert m to dim_t; a negative m becomes zero. */ \
    bli_convert_blas_dim1( *m, m_dim ); \
\
    /* Derive the start pointer and increment passed on to BLIS from */ \
    /* the BLAS (x, incx) pair; negative increments are handled here. */ \
    bli_convert_blas_incv( m_dim, (ftype*)x, *incx, x_first, x_stride ); \
\
    /* Row stride of A is 1 and column stride is the leading dimension. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
\
    /* Hand off to the typed BLIS routine; x is passed without */ \
    /* conjugation and the two trailing arguments are left NULL. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      BLIS_NO_CONJUGATE, \
      m_dim, \
      (ftype*)alpha, \
      x_first, x_stride, \
      (ftype*)a, a_rs, a_cs, \
      NULL, NULL \
    ); \
\
    /* Auto-finalize BLIS. */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( syr, syr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr.h 0000664 0000000 0000000 00000004062 14634250137 0022012 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
/*
   Prototype template for the BLAS-compatible syr wrapper.

   GENTPROTRO( ftype, ch, blasname ) declares PASTEF77(ch,blasname) with
   BLIS_EXPORT_BLAS linkage. The template is only expanded, through
   INSERT_GENTPROTRO_BLAS, when BLIS_ENABLE_BLAS is defined.
*/
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    x, \
       const f77_int*  incx, \
       ftype*          a, \
       const f77_int*  lda \
     );
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr2.c 0000664 0000000 0000000 00000006462 14634250137 0022075 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
/*
   Template for the BLAS-compatible syr2 wrapper.

   GENTFUNCRO( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function
     - initializes BLIS with bli_init_auto(),
     - runs the PASTEBLACHK(blasname) argument check,
     - converts uploa, m, x/incx and y/incy to their BLIS counterparts,
     - addresses A with a row stride of 1 and a column stride of *lda,
     - calls PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) with BLIS_NO_CONJUGATE
       for both vectors, and
     - finalizes BLIS with bli_finalize_auto().

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNCRO
#define GENTFUNCRO( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    x, \
       const f77_int*  incx, \
       const ftype*    y, \
       const f77_int*  incy, \
       ftype*          a, \
       const f77_int*  lda \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    uplo_t uplo_blis; \
    dim_t  m_blis; \
    inc_t  incx_blis; \
    inc_t  incy_blis; \
    ftype* x_blis; \
    ftype* y_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploa, \
      m, \
      incx, \
      incy, \
      lda \
    ); \
    \
    /* Translate the uplo character into the matching BLIS enum value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    \
    /* Typecast m to dim_t; a negative m becomes zero. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    \
    /* Derive the BLIS pointer/increment pairs for x and y; this is the \
       step that accounts for negative BLAS increments. */ \
    bli_convert_blas_incv( m_blis, (ftype*)x, *incx, x_blis, incx_blis ); \
    bli_convert_blas_incv( m_blis, (ftype*)y, *incy, y_blis, incy_blis ); \
    \
    /* A is addressed with unit row stride and a column stride of lda. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    \
    /* Forward everything to the typed BLIS routine. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      BLIS_NO_CONJUGATE, \
      BLIS_NO_CONJUGATE, \
      m_blis, \
      (ftype*)alpha, \
      x_blis, incx_blis, \
      y_blis, incy_blis, \
      a, a_rs, a_cs, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCRO_BLAS( syr2, syr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr2.h 0000664 0000000 0000000 00000004144 14634250137 0022075 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
/*
   Prototype template for the BLAS-compatible syr2 wrapper.

   GENTPROTRO( ftype, ch, blasname ) declares PASTEF77(ch,blasname) with
   BLIS_EXPORT_BLAS linkage. The template is only expanded, through
   INSERT_GENTPROTRO_BLAS, when BLIS_ENABLE_BLAS is defined.
*/
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int*  m, \
       const ftype*    alpha, \
       const ftype*    x, \
       const f77_int*  incx, \
       const ftype*    y, \
       const f77_int*  incy, \
       ftype*          a, \
       const f77_int*  lda \
     );
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr2 )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr2k.c 0000664 0000000 0000000 00000015217 14634250137 0022246 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
/*
   Typed-API variant of the BLAS-compatible syr2k wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps uploc/transa to BLIS enums,
   converts m and k to dim_t, addresses A, B and C with unit row stride and
   column strides *lda, *ldb and *ldc, and then calls
   PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), passing the same trans value for
   both A and B, before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    dim_t   m_blis; \
    dim_t   k_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploc, transa, \
      m, k, \
      lda, ldb, ldc \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploc, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    \
    /* netlib BLAS lets the real-domain ssyr2k/dsyr2k take trans = 'C' and \
       handles it exactly like 'T', so for real types a conjugate-transpose \
       request is replaced by a plain transpose. */ \
    if ( bli_is_real( PASTEMAC(ch,type) ) && bli_is_conjtrans( trans_blis ) ) \
        trans_blis = BLIS_TRANSPOSE; \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *k, k_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t b_rs = 1, b_cs = *ldb; \
    const inc_t c_rs = 1, c_cs = *ldc; \
    \
    /* Forward everything to the typed BLIS routine. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      trans_blis, \
      trans_blis, \
      m_blis, \
      k_blis, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      (ftype*)b, b_rs, b_cs, \
      (ftype*)beta, \
      c, c_rs, c_cs, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#else
/*
   Object-API variant of the BLAS-compatible syr2k wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is not defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps uploc/transa to BLIS enums,
   converts m and k to dim_t, wraps alpha, A, B, beta and C in obj_t
   objects (C is m-by-m, tagged BLIS_SYMMETRIC with the requested uplo; A and
   B both carry the same trans value), and then calls
   PASTEMAC(blisname,BLIS_OAPI_EX_SUF) before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    dim_t   m_blis; \
    dim_t   k_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploc, transa, \
      m, k, \
      lda, ldb, ldc \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploc, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    \
    /* netlib BLAS lets the real-domain ssyr2k/dsyr2k take trans = 'C' and \
       handles it exactly like 'T', so for real types a conjugate-transpose \
       request is replaced by a plain transpose. */ \
    if ( bli_is_real( PASTEMAC(ch,type) ) && bli_is_conjtrans( trans_blis ) ) \
        trans_blis = BLIS_TRANSPOSE; \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *k, k_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t b_rs = 1, b_cs = *ldb; \
    const inc_t c_rs = 1, c_cs = *ldc; \
    \
    /* Datatype tag shared by every object created below. */ \
    const num_t dt_blis = PASTEMAC(ch,type); \
    \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t a_obj     = BLIS_OBJECT_INITIALIZER; \
    obj_t b_obj     = BLIS_OBJECT_INITIALIZER; \
    obj_t c_obj     = BLIS_OBJECT_INITIALIZER; \
    \
    /* Dimensions of A and B as determined by the trans value; B uses the \
       same trans value as A. */ \
    dim_t a_rows, a_cols; \
    dim_t b_rows, b_cols; \
    \
    bli_set_dims_with_trans( trans_blis, m_blis, k_blis, &a_rows, &a_cols ); \
    bli_set_dims_with_trans( trans_blis, m_blis, k_blis, &b_rows, &b_cols ); \
    \
    /* Attach the scalar buffers. */ \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)alpha, &alpha_obj ); \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)beta,  &beta_obj ); \
    \
    /* Attach the matrix buffers; C is m-by-m. */ \
    bli_obj_init_finish( dt_blis, a_rows, a_cols, (ftype*)a, a_rs, a_cs, &a_obj ); \
    bli_obj_init_finish( dt_blis, b_rows, b_cols, (ftype*)b, b_rs, b_cs, &b_obj ); \
    bli_obj_init_finish( dt_blis, m_blis, m_blis, c, c_rs, c_cs, &c_obj ); \
    \
    /* Record the properties of C, then the trans value of A and B. */ \
    bli_obj_set_uplo( uplo_blis, &c_obj ); \
    bli_obj_set_struc( BLIS_SYMMETRIC, &c_obj ); \
    bli_obj_set_conjtrans( trans_blis, &a_obj ); \
    bli_obj_set_conjtrans( trans_blis, &b_obj ); \
    \
    /* Forward everything to the object-based BLIS routine. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      &alpha_obj, \
      &a_obj, \
      &b_obj, \
      &beta_obj, \
      &c_obj, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#endif
/* Expand INSERT_GENTFUNC_BLAS for syr2k only when BLIS_ENABLE_BLAS is
   defined. NOTE(review): presumably this instantiates the GENTFUNC template
   defined above -- confirm against the definition of INSERT_GENTFUNC_BLAS. */
#if defined(BLIS_ENABLE_BLAS)
INSERT_GENTFUNC_BLAS( syr2k, syr2k )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syr2k.h 0000664 0000000 0000000 00000004271 14634250137 0022251 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
/*
   Prototype template for the BLAS-compatible syr2k wrapper.

   GENTPROT( ftype, ch, blasname ) declares PASTEF77(ch,blasname) with
   BLIS_EXPORT_BLAS linkage. The template is only expanded, through
   INSERT_GENTPROT_BLAS, when BLIS_ENABLE_BLAS is defined.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     );
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syrk.c 0000664 0000000 0000000 00000014120 14634250137 0022154 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
/*
   Typed-API variant of the BLAS-compatible syrk wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps uploc/transa to BLIS enums,
   converts m and k to dim_t, addresses A and C with unit row stride and
   column strides *lda and *ldc, and then calls
   PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    dim_t   m_blis; \
    dim_t   k_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploc, transa, \
      m, k, \
      lda, ldc \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploc, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    \
    /* netlib BLAS lets the real-domain ssyrk/dsyrk take trans = 'C' and \
       handles it exactly like 'T', so for real types a conjugate-transpose \
       request is replaced by a plain transpose. */ \
    if ( bli_is_real( PASTEMAC(ch,type) ) && bli_is_conjtrans( trans_blis ) ) \
        trans_blis = BLIS_TRANSPOSE; \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *k, k_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t c_rs = 1, c_cs = *ldc; \
    \
    /* Forward everything to the typed BLIS routine. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      trans_blis, \
      m_blis, \
      k_blis, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      (ftype*)beta, \
      c, c_rs, c_cs, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#else
/*
   Object-API variant of the BLAS-compatible syrk wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is not defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps uploc/transa to BLIS enums,
   converts m and k to dim_t, wraps alpha, A, beta and C in obj_t objects
   (C is m-by-m, tagged BLIS_SYMMETRIC with the requested uplo), and then
   calls PASTEMAC(blisname,BLIS_OAPI_EX_SUF) before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    dim_t   m_blis; \
    dim_t   k_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploc, transa, \
      m, k, \
      lda, ldc \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_uplo( *uploc, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    \
    /* netlib BLAS lets the real-domain ssyrk/dsyrk take trans = 'C' and \
       handles it exactly like 'T', so for real types a conjugate-transpose \
       request is replaced by a plain transpose. */ \
    if ( bli_is_real( PASTEMAC(ch,type) ) && bli_is_conjtrans( trans_blis ) ) \
        trans_blis = BLIS_TRANSPOSE; \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *k, k_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t c_rs = 1, c_cs = *ldc; \
    \
    /* Datatype tag shared by every object created below. */ \
    const num_t dt_blis = PASTEMAC(ch,type); \
    \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t beta_obj  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t a_obj     = BLIS_OBJECT_INITIALIZER; \
    obj_t c_obj     = BLIS_OBJECT_INITIALIZER; \
    \
    /* Dimensions of A as determined by the trans value. */ \
    dim_t a_rows, a_cols; \
    \
    bli_set_dims_with_trans( trans_blis, m_blis, k_blis, &a_rows, &a_cols ); \
    \
    /* Attach the scalar buffers. */ \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)alpha, &alpha_obj ); \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)beta,  &beta_obj ); \
    \
    /* Attach the matrix buffers; C is m-by-m. */ \
    bli_obj_init_finish( dt_blis, a_rows, a_cols, (ftype*)a, a_rs, a_cs, &a_obj ); \
    bli_obj_init_finish( dt_blis, m_blis, m_blis, c, c_rs, c_cs, &c_obj ); \
    \
    /* Record the properties of C, then the trans value of A. */ \
    bli_obj_set_uplo( uplo_blis, &c_obj ); \
    bli_obj_set_struc( BLIS_SYMMETRIC, &c_obj ); \
    bli_obj_set_conjtrans( trans_blis, &a_obj ); \
    \
    /* Forward everything to the object-based BLIS routine. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      &alpha_obj, \
      &a_obj, \
      &beta_obj, \
      &c_obj, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#endif
/* Expand INSERT_GENTFUNC_BLAS for syrk only when BLIS_ENABLE_BLAS is
   defined. NOTE(review): presumably this instantiates the GENTFUNC template
   defined above -- confirm against the definition of INSERT_GENTFUNC_BLAS. */
#if defined(BLIS_ENABLE_BLAS)
INSERT_GENTFUNC_BLAS( syrk, syrk )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_syrk.h 0000664 0000000 0000000 00000004210 14634250137 0022160 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
/*
   Prototype template for the BLAS-compatible syrk wrapper.

   GENTPROT( ftype, ch, blasname ) declares PASTEF77(ch,blasname) with
   BLIS_EXPORT_BLAS linkage. The template is only expanded, through
   INSERT_GENTPROT_BLAS, when BLIS_ENABLE_BLAS is defined.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    beta, \
       ftype*          c, \
       const f77_int*  ldc \
     );
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trmm.c 0000664 0000000 0000000 00000013446 14634250137 0022155 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
/*
   Typed-API variant of the BLAS-compatible trmm wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps side/uploa/transa/diaga to
   BLIS enums, converts m and n to dim_t, addresses A and B with unit row
   stride and column strides *lda and *ldb, and then calls
   PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       ftype*          b, \
       const f77_int*  ldb \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    side_t  side_blis; \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    dim_t   m_blis; \
    dim_t   n_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, transa, diaga, \
      m, n, \
      lda, ldb \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_side( *side, &side_blis ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *n, n_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t b_rs = 1, b_cs = *ldb; \
    \
    /* Forward everything to the typed BLIS routine. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      side_blis, \
      uplo_blis, \
      trans_blis, \
      diag_blis, \
      m_blis, \
      n_blis, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      b, b_rs, b_cs, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#else
/*
   Object-API variant of the BLAS-compatible trmm wrapper template; this is
   the definition used when BLIS_BLAS3_CALLS_TAPI is not defined.

   GENTFUNC( ftype, ch, blasname, blisname ) expands to the definition of
   PASTEF77(ch,blasname). The generated function initializes BLIS, runs the
   PASTEBLACHK(blasname) argument check, maps side/uploa/transa/diaga to
   BLIS enums, converts m and n to dim_t, wraps alpha, A and B in obj_t
   objects (A is square with its order chosen by bli_set_dim_with_side() and
   is tagged BLIS_TRIANGULAR; B is m-by-n), and then calls
   PASTEMAC(blisname,BLIS_OAPI_EX_SUF) before finalizing BLIS.

   NOTE(review): the two trailing NULL arguments are presumably the optional
   context/runtime handles of the expert interface -- confirm against the
   callee's prototype.
*/
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       ftype*          b, \
       const f77_int*  ldb \
     ) \
{ \
    /* BLIS-side counterparts of the incoming BLAS arguments. */ \
    side_t  side_blis; \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    dim_t   m_blis; \
    dim_t   n_blis; \
    \
    /* Make sure BLIS is initialized. */ \
    bli_init_auto(); \
    \
    /* Apply the BLAS-style argument validation for this routine. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, transa, diaga, \
      m, n, \
      lda, ldb \
    ); \
    \
    /* Translate the option characters into BLIS enum values. */ \
    bli_param_map_netlib_to_blis_side( *side, &side_blis ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
    \
    /* Typecast the BLAS integer dimensions to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *n, n_blis ); \
    \
    /* Each matrix has unit row stride and its leading dimension as the \
       column stride. */ \
    const inc_t a_rs = 1, a_cs = *lda; \
    const inc_t b_rs = 1, b_cs = *ldb; \
    \
    /* Datatype tag shared by every object created below. */ \
    const num_t dt_blis = PASTEMAC(ch,type); \
    \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t a_obj     = BLIS_OBJECT_INITIALIZER; \
    obj_t b_obj     = BLIS_OBJECT_INITIALIZER; \
    \
    /* Order of the square matrix A, selected from m and n by the side. */ \
    dim_t a_dim; \
    \
    bli_set_dim_with_side( side_blis, m_blis, n_blis, &a_dim ); \
    \
    /* Attach the scalar buffer. */ \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)alpha, &alpha_obj ); \
    \
    /* Attach the matrix buffers; B is m-by-n. */ \
    bli_obj_init_finish( dt_blis, a_dim, a_dim, (ftype*)a, a_rs, a_cs, &a_obj ); \
    bli_obj_init_finish( dt_blis, m_blis, n_blis, b, b_rs, b_cs, &b_obj ); \
    \
    /* Record the properties of A. */ \
    bli_obj_set_uplo( uplo_blis, &a_obj ); \
    bli_obj_set_diag( diag_blis, &a_obj ); \
    bli_obj_set_conjtrans( trans_blis, &a_obj ); \
    bli_obj_set_struc( BLIS_TRIANGULAR, &a_obj ); \
    \
    /* Forward everything to the object-based BLIS routine. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      side_blis, \
      &alpha_obj, \
      &a_obj, \
      &b_obj, \
      NULL, \
      NULL \
    ); \
    \
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#endif
/* Expand INSERT_GENTFUNC_BLAS for trmm only when BLIS_ENABLE_BLAS is
   defined. NOTE(review): presumably this instantiates the GENTFUNC template
   defined above -- confirm against the definition of INSERT_GENTFUNC_BLAS. */
#if defined(BLIS_ENABLE_BLAS)
INSERT_GENTFUNC_BLAS( trmm, trmm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trmm.h 0000664 0000000 0000000 00000004250 14634250137 0022153 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype BLAS-to-BLIS interfaces.
//
/*
   Prototype template for the BLAS-compatible trmm wrapper.

   GENTPROT( ftype, ch, blasname ) declares PASTEF77(ch,blasname) with
   BLIS_EXPORT_BLAS linkage. The template is only expanded, through
   INSERT_GENTPROT_BLAS, when BLIS_ENABLE_BLAS is defined.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       ftype*          b, \
       const f77_int*  ldb \
     );
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trmv.c 0000664 0000000 0000000 00000006703 14634250137 0022164 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// BLAS-to-BLIS interface definition for trmv.
//
// GENTFUNC generates the routine whose name is produced by
// PASTEF77(ch,blasname). The generated routine runs the BLAS-style
// parameter check, converts its by-pointer arguments to BLIS types, and
// forwards to the typed BLIS routine named by
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), passing a pointer to the
// constant one as the scalar argument and NULL for the two trailing
// arguments. The macro is only instantiated, through INSERT_GENTFUNC_BLAS,
// when BLIS_ENABLE_BLAS is defined.
//
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    x, \
       const f77_int*  incx \
     ) \
{ \
    /* Initialize BLIS before any other library call. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style parameter checks. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploa, transa, diaga, \
      m, lda, incx \
    ); \
\
    /* Translate each BLAS character flag into its BLIS enum value. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
\
    /* Convert m to a BLIS dimension; negative values become zero. */ \
    dim_t m_blis; \
    bli_convert_blas_dim1( *m, m_blis ); \
\
    /* Let bli_convert_blas_incv() derive the vector pointer and */ \
    /* increment to use, which matters when incx is negative. */ \
    ftype* x_blis; \
    inc_t  incx_blis; \
    bli_convert_blas_incv( m_blis, (ftype*)x, *incx, x_blis, incx_blis ); \
\
    /* A is described with unit row stride and column stride lda. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
\
    /* Pointer to the global scalar constant BLIS_ONE. */ \
    ftype* one_ptr = PASTEMAC(ch,1); \
\
    /* Forward to the typed BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      trans_blis, \
      diag_blis, \
      m_blis, \
      one_ptr, \
      (ftype*)a, a_rs, a_cs, \
      x_blis, incx_blis, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTFUNC_BLAS( trmv, trmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trmv.h 0000664 0000000 0000000 00000004116 14634250137 0022165 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// BLAS-to-BLIS interface prototypes for trmv.
//
// GENTPROT declares, with BLIS_EXPORT_BLAS linkage, the routine whose name
// is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer. The macro is only instantiated, through INSERT_GENTPROT_BLAS,
// when BLIS_ENABLE_BLAS is defined.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    x, \
       const f77_int*  incx \
     );

#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTPROT_BLAS( trmv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trsm.c 0000664 0000000 0000000 00000013446 14634250137 0022163 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define BLAS-to-BLIS interfaces.
//
#ifdef BLIS_BLAS3_CALLS_TAPI
// Variant used when BLIS_BLAS3_CALLS_TAPI is defined: the converted
// arguments are passed straight to the typed BLIS routine named by
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), with NULL for the two trailing
// arguments.
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    b, \
       const f77_int*  ldb \
     ) \
{ \
    /* Initialize BLIS before any other library call. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style parameter checks. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, transa, diaga, \
      m, n, lda, ldb \
    ); \
\
    /* Translate each BLAS character flag into its BLIS enum value. */ \
    side_t  side_blis; \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    bli_param_map_netlib_to_blis_side( *side, &side_blis ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
\
    /* Convert the BLAS integer dimensions to BLIS dimensions. */ \
    dim_t m_blis, n_blis; \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *n, n_blis ); \
\
    /* Both matrices have unit row stride; lda and ldb supply the */ \
    /* column strides of A and B respectively. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
    const inc_t b_rs = 1; \
    const inc_t b_cs = *ldb; \
\
    /* Forward to the typed BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      side_blis, \
      uplo_blis, \
      trans_blis, \
      diag_blis, \
      m_blis, \
      n_blis, \
      (ftype*)alpha, \
      (ftype*)a, a_rs, a_cs, \
      (ftype*)b, b_rs, b_cs, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#else
// Variant used when BLIS_BLAS3_CALLS_TAPI is not defined: alpha, A and B
// are wrapped in obj_t objects and handed to the BLIS object routine named
// by PASTEMAC(blisname,BLIS_OAPI_EX_SUF), with NULL for the two trailing
// arguments.
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    b, \
       const f77_int*  ldb \
     ) \
{ \
    /* Initialize BLIS before any other library call. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style parameter checks. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      side, uploa, transa, diaga, \
      m, n, lda, ldb \
    ); \
\
    /* Translate each BLAS character flag into its BLIS enum value. */ \
    side_t  side_blis; \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    bli_param_map_netlib_to_blis_side( *side, &side_blis ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
\
    /* Convert the BLAS integer dimensions to BLIS dimensions. */ \
    dim_t m_blis, n_blis; \
    bli_convert_blas_dim1( *m, m_blis ); \
    bli_convert_blas_dim1( *n, n_blis ); \
\
    /* Both matrices have unit row stride; lda and ldb supply the */ \
    /* column strides of A and B respectively. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
    const inc_t b_rs = 1; \
    const inc_t b_cs = *ldb; \
\
    /* Datatype tag shared by all three objects. */ \
    const num_t dt_blis = PASTEMAC(ch,type); \
\
    /* A is created square; bli_set_dim_with_side() picks its order */ \
    /* from m and n according to the side. */ \
    dim_t a_order; \
    bli_set_dim_with_side( side_blis, m_blis, n_blis, &a_order ); \
\
    /* Wrap alpha as a 1x1 object. */ \
    obj_t alpha_obj = BLIS_OBJECT_INITIALIZER_1X1; \
    bli_obj_init_finish_1x1( dt_blis, (ftype*)alpha, &alpha_obj ); \
\
    /* Wrap A, then record its uplo, diag, conjtrans and triangular */ \
    /* structure attributes. */ \
    obj_t a_obj = BLIS_OBJECT_INITIALIZER; \
    bli_obj_init_finish( dt_blis, a_order, a_order, (ftype*)a, a_rs, a_cs, &a_obj ); \
    bli_obj_set_uplo( uplo_blis, &a_obj ); \
    bli_obj_set_diag( diag_blis, &a_obj ); \
    bli_obj_set_conjtrans( trans_blis, &a_obj ); \
    bli_obj_set_struc( BLIS_TRIANGULAR, &a_obj ); \
\
    /* Wrap B as an m-by-n object. */ \
    obj_t b_obj = BLIS_OBJECT_INITIALIZER; \
    bli_obj_init_finish( dt_blis, m_blis, n_blis, (ftype*)b, b_rs, b_cs, &b_obj ); \
\
    /* Forward to the BLIS object interface. */ \
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      side_blis, \
      &alpha_obj, \
      &a_obj, \
      &b_obj, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}
#endif
// Instantiate whichever GENTFUNC was selected, but only when the BLAS
// compatibility layer is enabled.
#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTFUNC_BLAS( trsm, trsm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trsm.h 0000664 0000000 0000000 00000004250 14634250137 0022161 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// BLAS-to-BLIS interface prototypes for trsm.
//
// GENTPROT declares, with BLIS_EXPORT_BLAS linkage, the routine whose name
// is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer. The macro is only instantiated, through INSERT_GENTPROT_BLAS,
// when BLIS_ENABLE_BLAS is defined.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    b, \
       const f77_int*  ldb \
     );

#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTPROT_BLAS( trsm )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trsv.c 0000664 0000000 0000000 00000006703 14634250137 0022172 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// BLAS-to-BLIS interface definition for trsv.
//
// GENTFUNC generates the routine whose name is produced by
// PASTEF77(ch,blasname). The generated routine runs the BLAS-style
// parameter check, converts its by-pointer arguments to BLIS types, and
// forwards to the typed BLIS routine named by
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF), passing a pointer to the
// constant one as the scalar argument and NULL for the two trailing
// arguments. The macro is only instantiated, through INSERT_GENTFUNC_BLAS,
// when BLIS_ENABLE_BLAS is defined.
//
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    x, \
       const f77_int*  incx \
     ) \
{ \
    /* Initialize BLIS before any other library call. */ \
    bli_init_auto(); \
\
    /* Run the BLAS-style parameter checks. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), MKSTR(blasname), \
      uploa, transa, diaga, \
      m, lda, incx \
    ); \
\
    /* Translate each BLAS character flag into its BLIS enum value. */ \
    uplo_t  uplo_blis; \
    trans_t trans_blis; \
    diag_t  diag_blis; \
    bli_param_map_netlib_to_blis_uplo( *uploa, &uplo_blis ); \
    bli_param_map_netlib_to_blis_trans( *transa, &trans_blis ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &diag_blis ); \
\
    /* Convert m to a BLIS dimension; negative values become zero. */ \
    dim_t m_blis; \
    bli_convert_blas_dim1( *m, m_blis ); \
\
    /* Let bli_convert_blas_incv() derive the vector pointer and */ \
    /* increment to use, which matters when incx is negative. */ \
    ftype* x_blis; \
    inc_t  incx_blis; \
    bli_convert_blas_incv( m_blis, (ftype*)x, *incx, x_blis, incx_blis ); \
\
    /* A is described with unit row stride and column stride lda. */ \
    const inc_t a_rs = 1; \
    const inc_t a_cs = *lda; \
\
    /* Pointer to the global scalar constant BLIS_ONE. */ \
    ftype* one_ptr = PASTEMAC(ch,1); \
\
    /* Forward to the typed BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      uplo_blis, \
      trans_blis, \
      diag_blis, \
      m_blis, \
      one_ptr, \
      (ftype*)a, a_rs, a_cs, \
      x_blis, incx_blis, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTFUNC_BLAS( trsv, trsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bla_trsv.h 0000664 0000000 0000000 00000004116 14634250137 0022173 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// BLAS-to-BLIS interface prototypes for trsv.
//
// GENTPROT declares, with BLIS_EXPORT_BLAS linkage, the routine whose name
// is produced by PASTEF77(ch,blasname). Every argument is passed by
// pointer. The macro is only instantiated, through INSERT_GENTPROT_BLAS,
// when BLIS_ENABLE_BLAS is defined.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    x, \
       const f77_int*  incx \
     );

#if defined( BLIS_ENABLE_BLAS )
INSERT_GENTPROT_BLAS( trsv )
#endif
cython-blis-1.0.0/blis/_src/frame/compat/bli_blas.h 0000664 0000000 0000000 00000012643 14634250137 0022132 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Requesting the CBLAS compatibility layer forces the BLAS layer on, in
// case it was not already enabled.
#if defined( BLIS_ENABLE_CBLAS ) && !defined( BLIS_ENABLE_BLAS )
#define BLIS_ENABLE_BLAS
#endif

// BLIS_ENABLE_BLAS_DEFS controls whether the BLAS prototypes are declared.
// It starts out mirroring BLIS_ENABLE_BLAS, so that an enabled BLAS layer
// gets its prototypes by default.
#if !defined( BLIS_ENABLE_BLAS )
#undef BLIS_ENABLE_BLAS_DEFS
#else
#define BLIS_ENABLE_BLAS_DEFS
#endif

// The prototypes are then suppressed again in two situations:
// - the BLAS test drivers are being compiled (BLIS_VIA_BLASTEST), or
// - the environment defined BLIS_DISABLE_BLAS_DEFS, e.g. because the user
//   includes "blis.h" together with another header that also declares
//   the BLAS functions.
#if defined( BLIS_VIA_BLASTEST ) || defined( BLIS_DISABLE_BLAS_DEFS )
#undef BLIS_ENABLE_BLAS_DEFS
#endif
// Begin including all BLAS prototypes. Everything up to the matching
// #endif is skipped unless BLIS_ENABLE_BLAS_DEFS is defined.
#ifdef BLIS_ENABLE_BLAS_DEFS
// -- System headers needed by BLAS compatibility layer --
#include <ctype.h> // for toupper(), used in xerbla()
// -- Constants --
// Buffer length for a BLAS function name string, written as 7+1.
// NOTE(review): presumably seven name characters plus a terminator; confirm
// against the users of this constant.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)
// -- Utility macros --
#include "bla_r_sign.h"
#include "bla_d_sign.h"
#include "bla_r_cnjg.h"
#include "bla_d_cnjg.h"
#include "bla_r_imag.h"
#include "bla_d_imag.h"
#include "bla_c_div.h"
#include "bla_z_div.h"
#include "bla_f__cabs.h" // needed by c_abs, z_abs
#include "bla_r_abs.h"
#include "bla_d_abs.h"
#include "bla_c_abs.h"
#include "bla_z_abs.h"
#include "bla_lsame.h"
#include "bla_xerbla.h"
#include "bla_xerbla_array.h"
// -- Level-0 BLAS prototypes --
#include "bla_cabs1.h"
// -- Level-1 BLAS prototypes --
#include "bla_amax.h"
#include "bla_asum.h"
#include "bla_axpy.h"
#include "bla_copy.h"
#include "bla_dot.h"
#include "bla_nrm2.h"
#include "bla_rot.h"
#include "bla_rotg.h"
#include "bla_rotm.h"
#include "bla_rotmg.h"
#include "bla_scal.h"
#include "bla_swap.h"
#include "f77_amax_sub.h"
#include "f77_asum_sub.h"
#include "f77_dot_sub.h"
#include "f77_nrm2_sub.h"
// -- Level-2 BLAS prototypes --
// dense
#include "bla_gemv.h"
#include "bla_ger.h"
#include "bla_hemv.h"
#include "bla_her.h"
#include "bla_her2.h"
#include "bla_symv.h"
#include "bla_syr.h"
#include "bla_syr2.h"
#include "bla_trmv.h"
#include "bla_trsv.h"
#include "bla_gemv_check.h"
#include "bla_ger_check.h"
#include "bla_hemv_check.h"
#include "bla_her_check.h"
#include "bla_her2_check.h"
#include "bla_symv_check.h"
#include "bla_syr_check.h"
#include "bla_syr2_check.h"
#include "bla_trmv_check.h"
#include "bla_trsv_check.h"
// packed
#include "bla_hpmv.h"
#include "bla_hpr.h"
#include "bla_hpr2.h"
#include "bla_spmv.h"
#include "bla_spr.h"
#include "bla_spr2.h"
#include "bla_tpmv.h"
#include "bla_tpsv.h"
// banded
#include "bla_gbmv.h"
#include "bla_hbmv.h"
#include "bla_sbmv.h"
#include "bla_tbmv.h"
#include "bla_tbsv.h"
// -- Level-3 BLAS prototypes --
#include "bla_gemm.h"
#include "bla_hemm.h"
#include "bla_herk.h"
#include "bla_her2k.h"
#include "bla_symm.h"
#include "bla_syrk.h"
#include "bla_syr2k.h"
#include "bla_trmm.h"
#include "bla_trsm.h"
#include "bla_gemm_check.h"
#include "bla_hemm_check.h"
#include "bla_herk_check.h"
#include "bla_her2k_check.h"
#include "bla_symm_check.h"
#include "bla_syrk_check.h"
#include "bla_syr2k_check.h"
#include "bla_trmm_check.h"
#include "bla_trsm_check.h"
// -- BLAS extension prototypes --
// unique to BLIS
#include "bla_axpby.h"
// level-3
#include "bla_gemmt.h"
#include "bla_gemmt_check.h"
// batch
#include "bla_gemm_batch.h"
// 3m
#include "bla_gemm3m.h"
#include "bla_gemm3m_check.h"
// -- Fortran-compatible APIs to BLIS functions --
#include "b77_thread.h"
#endif // BLIS_ENABLE_BLAS_DEFS
cython-blis-1.0.0/blis/_src/frame/compat/blis/ 0000775 0000000 0000000 00000000000 14634250137 0021135 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/blis/thread/ 0000775 0000000 0000000 00000000000 14634250137 0022404 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/blis/thread/b77_thread.c 0000664 0000000 0000000 00000005302 14634250137 0024476 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2018, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "blis.h"
//
// Define Fortran-compatible BLIS interfaces.
//
void PASTEF770(bli_thread_set_ways)
(
const f77_int* jc,
const f77_int* pc,
const f77_int* ic,
const f77_int* jr,
const f77_int* ir
)
{
dim_t jc0 = *jc;
dim_t pc0 = *pc;
dim_t ic0 = *ic;
dim_t jr0 = *jr;
dim_t ir0 = *ir;
// Initialize BLIS.
bli_init_auto();
// Convert/typecast negative values to zero.
//bli_convert_blas_dim1( *jc, jc0 );
//bli_convert_blas_dim1( *pc, pc0 );
//bli_convert_blas_dim1( *ic, ic0 );
//bli_convert_blas_dim1( *jr, jr0 );
//bli_convert_blas_dim1( *ir, ir0 );
// Call the BLIS function.
bli_thread_set_ways( jc0, pc0, ic0, jr0, ir0 );
// Finalize BLIS.
bli_finalize_auto();
}
void PASTEF770(bli_thread_set_num_threads)
(
const f77_int* nt
)
{
dim_t nt0 = *nt;
// Initialize BLIS.
bli_init_auto();
// Convert/typecast negative values to zero.
//bli_convert_blas_dim1( *nt, nt0 );
// Call the BLIS function.
bli_thread_set_num_threads( nt0 );
// Finalize BLIS.
bli_finalize_auto();
}
cython-blis-1.0.0/blis/_src/frame/compat/blis/thread/b77_thread.h 0000664 0000000 0000000 00000003755 14634250137 0024515 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// Prototype Fortran-compatible BLIS interfaces.
//
// Fortran-compatible wrapper around bli_thread_set_ways(); each of the
// five arguments is passed by reference.
BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways)
     (
       const f77_int* jc,
       const f77_int* pc,
       const f77_int* ic,
       const f77_int* jr,
       const f77_int* ir
     );
// Fortran-compatible wrapper around bli_thread_set_num_threads(); the
// thread count is passed by reference.
BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads)( const f77_int* nt );
cython-blis-1.0.0/blis/_src/frame/compat/cblas/ 0000775 0000000 0000000 00000000000 14634250137 0021270 5 ustar 00root root 0000000 0000000 cython-blis-1.0.0/blis/_src/frame/compat/cblas/bli_cblas.h 0000664 0000000 0000000 00000004155 14634250137 0023360 0 ustar 00root root 0000000 0000000 /*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Include guard: the contents below are processed at most once.
#if !defined( BLIS_CBLAS_H )
#define BLIS_CBLAS_H

// Nothing is pulled in unless the CBLAS compatibility layer is enabled.
#if defined( BLIS_ENABLE_CBLAS )

// Drop F77_INT and F77_CHAR so that CBLAS performs no internal
// conversion; the function signatures have been modified to use the
// proper integer types directly.
#undef F77_INT
#undef F77_CHAR

// Bring in the main CBLAS header, so that including this file (probably
// via blis.h) gives applications the CBLAS prototypes and definitions.
#include "cblas.h"

#endif // BLIS_ENABLE_CBLAS

#endif // BLIS_CBLAS_H
cython-blis-1.0.0/blis/_src/frame/compat/cblas/cblas.tgz 0000664 0000000 0000000 00000601534 14634250137 0023113 0 ustar 00root root 0000000 0000000 8M }80N>
y>BwMH 9$L_JmٖeR*JrU%ӫ\.yxH?f}ƁqhJ吔
889,v)hh)A{s>sM:kydͲ!_9::x0z2 ll?hx?Y'~8RqXͿQ)Ua+j
rJ/
8yEJ-.Cxk?Jp
dk=Gejo҇ϗp@̉ LԔ̾Sgڜ32>["f/O6lɗhrg!$dl
٬?Ff?#O}U
P=A'|8DfO{3,3twv#HN5hB3|:)GM=d<+%4ӼwxB `F? d843Z}0EhpH(Q?o<>t8J?MlݹO3L`4|yZk|'cWq"a# vJk F%NV\vfy4P8l5zĨkU*ʪH[]7TCaJqYIeCP[O2S23*j(*REV#-֓"BZEi\Hu!u3J ]5+6Cɘ>QP`ܙӌHkFS0tƩ,Eh$A2lj8جf%\S4Xhw/c^̭8}` D0'ǦZ`dvr҄6c} #VhM:uC[_`qE'XO 66*nT$z-0U(Bg7!@>*(Į.l`Jdp;>Be,ze2$lMFB
^hC:
o֍xN~>'\q(]vlS^%H9o5x8{
*cWk`5eM8^n}(aV@W;8#t6bѲ77?]?e?fbQV6?y\Mwq,r `#hyGk4J٭FE^xܗ\$u/[@f*}[tc?gDpFRi!ʓR
0<,Qg&f݁鏔^[huQ;kt^w2QVMF7d2ڮCdӢ4|[wa7n
3wV;;ک|O^(FQab|Oq:Vj[VJ٣nujmFSj[Ue'^^./+գJU6
e[ORI_.d4/+Ҥ{)LeUeq}DÊ_c#