pax_global_header00006660000000000000000000000064150022253160014506gustar00rootroot0000000000000052 comment=8aa8b822c8f6dac6272bb0749a498e3297b88510 compressed-tensors-0.9.4/000077500000000000000000000000001500222531600153575ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/000077500000000000000000000000001500222531600167175ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/.gitkeep000066400000000000000000000000001500222531600203360ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/actions/000077500000000000000000000000001500222531600203575ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/actions/test/000077500000000000000000000000001500222531600213365ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/actions/test/action.yml000066400000000000000000000016311500222531600233370ustar00rootroot00000000000000name: test compressed-tensors description: 'test compressed-tensors' inputs: venv: description: "path of virtualenv" required: true suitename: description: "test suite name" required: true outputs: status: description: "final status from test" value: ${{ steps.test.outputs.status }} runs: using: composite steps: - name: install wheel uses: neuralmagic/nm-actions/actions/install-whl@v1.2.0 with: venv: ${{ inputs.venv }} name: compressed extra: "[dev,accelerate]" - name: test id: test run: | source ${{ inputs.venv }}/bin/activate rm -rf src SUCCESS=0 pytest tests --junitxml=test-results/report.xml -o junit_suite_name="${{ inputs.suitename }}" || SUCCESS=$? echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" deactivate exit ${SUCCESS} shell: bash compressed-tensors-0.9.4/.github/scripts/000077500000000000000000000000001500222531600204065ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/scripts/step-status000077500000000000000000000004011500222531600226230ustar00rootroot00000000000000#!/bin/bash -e # echo "green encased checkmark" if "${1} == 0" # echo "red X" if "${1} != 0" STEP_STATUS=${1} if [ "$STEP_STATUS" -eq 0 ]; then # green check echo -e "\xE2\x9C\x85" else # red x echo -e "\xE2\x9D\x8C" fi compressed-tensors-0.9.4/.github/workflows/000077500000000000000000000000001500222531600207545ustar00rootroot00000000000000compressed-tensors-0.9.4/.github/workflows/build-test.yml000066400000000000000000000043561500222531600235630ustar00rootroot00000000000000name: build-test on: # makes workflow reusable workflow_call: inputs: wf_category: description: "workflow category: NIGHTLY, RELEASE" type: string default: NIGHTLY push_to_pypi: description: "When set to true, built whl and tar.gz will be pushed to public pypi if all tests pass" type: boolean default: false gitref: description: "git commit hash or tag name" type: string default: main # build related parameters build_label: description: "requested runner label for build (specifies instance)" type: string default: ubuntu-22.04 # test related parameters test_configs: description: "python, label, timeout" type: string required: true jobs: BUILD: uses: ./.github/workflows/build.yml with: wf_category: ${{ inputs.wf_category }} build_label: ${{ inputs.build_label }} gitref: ${{ inputs.gitref }} timeout: 20 secrets: inherit TEST: needs: [BUILD] strategy: fail-fast: false matrix: test_config: ${{ fromJson(inputs.test_configs) }} uses: ./.github/workflows/test.yml with: gitref: ${{ inputs.gitref }} test_label: ${{ matrix.test_config.label }} python: ${{ matrix.test_config.python }} timeout: ${{ matrix.test_config.timeout }} whl: ${{ needs.BUILD.outputs.whl }} secrets: inherit UPLOAD: needs: 
[TEST] uses: ./.github/workflows/upload.yml with: label: k8s-util timeout: 40 run_id: ${{ github.run_id }} push_to_pypi: ${{ inputs.push_to_pypi }} secrets: inherit REPORT: needs: [BUILD, TEST] if: success() || failure() uses: ./.github/workflows/report.yml with: label: rh-reporter timeout: 40 run_id: ${{ github.run_id }} run_name: compressed-tensors wheel: ${{ needs.BUILD.outputs.whl }} wf_category: ${{ inputs.wf_category }} gitref: ${{ inputs.gitref }} secrets: inherit compressed-tensors-0.9.4/.github/workflows/build.yml000066400000000000000000000104771500222531600226070ustar00rootroot00000000000000name: build on: # makes workflow reusable workflow_call: inputs: wf_category: description: "categories: NIGHTLY, RELEASE" type: string default: NIGHTLY build_label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string default: 20 gitref: description: "git commit hash or branch name" type: string default: main outputs: whl: description: 'basename for generated whl' value: ${{ jobs.BUILD.outputs.whl }} # makes workflow manually callable workflow_dispatch: inputs: wf_category: description: "categories: NIGHTLY, RELEASE" type: string default: NIGHTLY build_label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string default: 20 gitref: description: "git commit hash or branch name" type: string default: main jobs: BUILD: runs-on: ${{ inputs.build_label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} permissions: contents: 'read' id-token: 'write' outputs: run_id: ${{ github.run_id }} whl: ${{ steps.build.outputs.whlname }} tarfile: ${{ steps.build.outputs.tarname }} steps: - name: set python uses: actions/setup-python@v4 with: python-version: '3.12' - name: checkout code id: checkout uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true ref: ${{ inputs.gitref }} - name: build id: build uses: neuralmagic/nm-actions/actions/build-ml-whl@v1.18.0 with: dev: false release: ${{ inputs.wf_category == 'RELEASE' }} # GCP - name: 'Authenticate to Google Cloud' id: auth uses: google-github-actions/auth@v2.1.3 with: project_id: ${{ secrets.GCP_PROJECT }} workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_GHA_SA }} - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v2' with: version: '>= 473.0.0' - name: copy whl and source distribution run: | gcloud storage cp dist/${{ steps.build.outputs.whlname }} ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/${{ steps.build.outputs.whlname }} gcloud storage cp dist/${{ steps.build.outputs.tarname }} ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/${{ steps.build.outputs.tarname }} - name: upload whl uses: actions/upload-artifact@v4 if: success() || failure() with: name: ${{ steps.build.outputs.whlname }} path: dist/${{ steps.build.outputs.whlname }} retention-days: 5 - name: upload tar.gz uses: actions/upload-artifact@v4 if: success() || failure() with: name: ${{ steps.build.outputs.tarname }} path: dist/${{ steps.build.outputs.tarname }} retention-days: 5 - name: summary uses: neuralmagic/nm-actions/actions/summary-build@v1.2.0 if: success() || failure() with: label: ${{ inputs.build_label }} gitref: ${{ inputs.gitref }} whl_status: ${{ steps.build.outputs.status }} - name: run status id: run_status if: success() || failure() env: WHL_STATUS: ${{ 
steps.build.outputs.status }} run: | echo "build status: ${WHL_STATUS}" if [ -z "${WHL_STATUS}" ] || [ "${WHL_STATUS}" -ne "0" ]; then exit 1; fi compressed-tensors-0.9.4/.github/workflows/report.yml000066400000000000000000000111701500222531600230120ustar00rootroot00000000000000name: report test results to reportportal run-name: ${{ github.actor }} report results for run ${{ inputs.run_id }} on: workflow_call: inputs: label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true run_id: description: "run_id of 'build.yml' run that generated the assets" type: string required: true wheel: description: wheel used for testing type: string required: true run_name: description: name of the test run type: string required: true wf_category: description: "categories: NIGHTLY, RELEASE" type: string default: NIGHTLY gitref: description: "git commit hash or branch name" type: string default: main workflow_dispatch: inputs: label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true run_id: description: "run_id of 'build.yml' run that generated the assets" type: string required: true wheel: description: wheel used for testing type: string required: true run_name: description: name of the test run type: string required: true wf_category: description: "categories: NIGHTLY, RELEASE" type: string default: NIGHTLY gitref: description: "git commit hash or branch name" type: string default: main jobs: REPORT: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} permissions: contents: 'read' id-token: 'write' steps: - name: checkout code id: checkout uses: actions/checkout@v4 with: ref: ${{ inputs.gitref }} - name: 'Authenticate to Google Cloud' id: auth-gcp uses: google-github-actions/auth@v2.1.3 with: project_id: ${{ secrets.GCP_PROJECT }} workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_GHA_SA }} - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v2' with: version: '>= 473.0.0' - name: download assets uses: neuralmagic/nm-actions/actions/gcp-download-assets@v1.1.0 with: bucket_source: ${{ secrets.GCP_BUILD_ML_ASSETS2 }} run_id: ${{ inputs.run_id }} - name: generate metadata info id: generate-metadata run: | jq --raw-output -n '{ "hostname": "${{ secrets.REPORTPORTAL_HOST }}", "project": "compressed-tensors", "name": "${{ inputs.run_name }}", "description": "GitHub run: https://github.com/neuralmagic/compressed-tensors/actions/runs/${{ inputs.run_id }}", "attributes": [ {"key": "wheel", "value": "${{ inputs.wheel }}"}, {"key": "gitref", "value": "${{ inputs.gitref }}"}, {"key": "workflow category", "value": "${{ inputs.wf_category }}"} ] }' > metadata.json METADATA_FILEPATH=`realpath metadata.json` echo "metadata_filepath=${METADATA_FILEPATH}" | tee -a $GITHUB_OUTPUT shell: bash - name: report to reportportal uses: neuralmagic/nm-actions/actions/reportportal_submit_execution_results@v1.15.0 with: droute_username: ${{ secrets.DROUTE_USERNAME }} droute_password: ${{ secrets.DROUTE_PASSWORD }} droute_url: ${{ secrets.DROUTE_URL}} metadata_filepath: ${{ steps.generate-metadata.outputs.metadata_filepath }} compressed-tensors-0.9.4/.github/workflows/test-check.yaml000066400000000000000000000012221500222531600236670ustar00rootroot00000000000000name: Run Tests on: push: branches: - main 
pull_request: branches: - main jobs: python-tests: runs-on: ubuntu-24.04 steps: - uses: actions/setup-python@v4 with: python-version: '3.10' - uses: actions/checkout@v3 - name: Set Env run: | pip3 install --upgrade pip && pip3 install --upgrade setuptools pip3 install virtualenv virtualenv venv source venv/bin/activate - name: "⚙️ Install dependencies" run: pip3 install .[dev,accelerate] - name: "🔬 Running tests" run: make test compressed-tensors-0.9.4/.github/workflows/test.yml000066400000000000000000000103631500222531600224610ustar00rootroot00000000000000name: nightly test on: # makes workflow reusable workflow_call: inputs: gitref: description: "git commit hash or branch name" type: string required: true test_label: description: "requested runner label" type: string required: true python: description: "python version, e.g. 3.10.12" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true whl: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true # makes workflow manually callable workflow_dispatch: inputs: gitref: description: "git commit hash or branch name" type: string required: true test_label: description: "requested runner label" type: string required: true python: description: "python version, e.g. 3.10.12" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true whl: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true jobs: TEST: name: TEST (${{ inputs.python}}, ${{ inputs.test_label }}) runs-on: ${{ inputs.test_label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} permissions: contents: 'read' id-token: 'write' steps: - name: set python id: set_python uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} - name: verify python id: verify_python uses: neuralmagic/nm-actions/actions/verify-python@v1.2.0 with: python-version: ${{ inputs.python }} - name: checkout code id: checkout uses: actions/checkout@v4 with: ref: ${{ inputs.gitref }} - name: create virtualenv id: create_venv uses: neuralmagic/nm-actions/actions/create-virtualenv@v1.2.0 with: venv: TEST - name: download whl id: download uses: actions/download-artifact@v4 with: name: ${{ inputs.whl }} path: ${{ inputs.whl }} - name: run tests id: test uses: ./.github/actions/test/ with: venv: ${{ steps.create_venv.outputs.penv }} suitename: test-${{ inputs.python }}-${{ inputs.test_label }} - name: summary uses: neuralmagic/nm-actions/actions/summary-test@v1.13.0 if: success() || failure() with: test_label: ${{ inputs.test_label }} gitref: ${{ inputs.gitref }} python: ${{ inputs.python }} whl: ${{ inputs.whl }} test_status: ${{ steps.test.outputs.status }} # GCP - name: 'Authenticate to Google Cloud' id: auth uses: google-github-actions/auth@v2.1.3 with: project_id: ${{ secrets.GCP_PROJECT }} workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_GHA_SA }} - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v2' with: version: '>= 473.0.0' - name: copy results to GCP run: | gcloud storage cp test-results/report.xml ${{ secrets.GCP_BUILD_ML_ASSETS2 }}/${{ github.run_id }}/test-results/report-${{ inputs.test_label }}.xml - name: upload results uses: actions/upload-artifact@v4 if: success() || failure() with: name: report-${{ inputs.test_label }}.xml path: test-results/report.xml 
retention-days: 5 compressed-tensors-0.9.4/.github/workflows/trigger-all.yml000066400000000000000000000031241500222531600237100ustar00rootroot00000000000000name: build and test jobs for nightly and release run-name: ${{ github.actor }} triggered nightly or release on ${{ github.ref }} on: schedule: # * is a special character in YAML so you have to quote this string - cron: '30 0 * * *' # nightly run workflow_dispatch: inputs: wf_category: description: "workflow category, default is NIGHTLY" type: choice options: - NIGHTLY - RELEASE default: NIGHTLY push_to_pypi: description: "when set and tests pass, then '.whl' & '.tar.gz' will be pushed to public pypi" type: boolean default: false gitref: description: "git commit hash or tag name" type: string default: 'main' jobs: BUILD-TEST: uses: ./.github/workflows/build-test.yml name: ${{ inputs.wf_category || 'NIGHTLY' }} with: wf_category: ${{ inputs.wf_category || 'NIGHTLY' }} gitref: ${{ inputs.gitref || 'main' }} push_to_pypi: ${{ (github.event.schedule == '30 0 * * *') || inputs.push_to_pypi || false }} test_configs: '[{"python":"3.11.4","label":"ubuntu-22.04","timeout":"40"}, {"python":"3.10.12","label":"ubuntu-24.04","timeout":"40"}, {"python":"3.9.17","label":"k8s-h100-solo","timeout":"40"}, {"python":"3.12.6","label":"k8s-a100-duo","timeout":"40"}]' secrets: inherit compressed-tensors-0.9.4/.github/workflows/upload.yml000066400000000000000000000137651500222531600227770ustar00rootroot00000000000000name: upload compressed-tensors whl and tar.gz run-name: ${{ github.actor }} uploading whl/tar.gz from run ${{ inputs.run_id }} on: workflow_call: inputs: label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true run_id: description: "run_id of 'build.yml' run that generated the assets" type: string required: true push_to_pypi: description: "push asset to public pypi." type: boolean default: false workflow_dispatch: inputs: label: description: "requested runner label (specifies instance)" type: string required: true timeout: description: "time limit for run in minutes " type: string required: true run_id: description: "run_id of 'build.yml' run that generated the assets" type: string required: true push_to_pypi: description: "push asset to public pypi." 
type: boolean default: false jobs: UPLOAD: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} permissions: contents: 'read' id-token: 'write' steps: - name: install automation components uses: neuralmagic/nm-actions/actions/install-automation-components@v1.0.0 - name: set python id: set-python uses: actions/setup-python@v5 with: python-version: 3.10.12 # GCP - name: 'Authenticate to Google Cloud' id: auth-gcp uses: google-github-actions/auth@v2.1.3 with: project_id: ${{ secrets.GCP_PROJECT }} workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.GCP_GHA_SA }} - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v2' with: version: '>= 473.0.0' - name: download assets uses: neuralmagic/nm-actions/actions/gcp-download-assets@v1.1.0 with: bucket_source: ${{ secrets.GCP_BUILD_ML_ASSETS2 }} run_id: ${{ inputs.run_id }} # GCP - name: 'Authenticate to Google Cloud' id: auth-pypi uses: google-github-actions/auth@v2.1.3 with: project_id: ${{ secrets.GCP_PROJECT }} workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} service_account: ${{ secrets.NM_PYPI_SA }} - name: find whl id: find-asset-whl uses: neuralmagic/nm-actions/actions/find-asset@v1.1.0 with: run_id: ${{ inputs.run_id }} asset_identifier: 'compressed*.whl' - name: check if whl is new id: check-whl uses: neuralmagic/nm-actions/actions/check-whl-on-pypi@v1.19.0 with: whl: ${{ steps.find-asset-whl.outputs.asset }} - name: upload whl to "nm-pypi" if: ${{ steps.check-whl.outputs.status == '0' && inputs.push_to_pypi }} uses: neuralmagic/nm-actions/actions/gcp-upload-asset@v1.1.0 with: bucket_target: ${{ secrets.GCP_NM_PYPI_DIST }} asset: ${{ steps.find-asset-whl.outputs.asset }} - name: find tar.gz if: ${{ steps.check-whl.outputs.status == '0' && inputs.push_to_pypi }} id: find-asset-targz uses: neuralmagic/nm-actions/actions/find-asset@v1.1.0 with: run_id: ${{ inputs.run_id }} asset_identifier: 'compressed*.tar.gz' - name: upload tar.gz to "nm-pypi" if: ${{ steps.check-whl.outputs.status =='0' && inputs.push_to_pypi }} uses: neuralmagic/nm-actions/actions/gcp-upload-asset@v1.1.0 with: bucket_target: ${{ secrets.GCP_NM_PYPI_DIST }} asset: ${{ steps.find-asset-targz.outputs.asset }} - name: update "nm-pypi" index if: ${{ steps.check-whl.outputs.status == '0' && inputs.push_to_pypi }} uses: actions/github-script@v6 with: github-token: ${{ secrets.NM_PYPI_WORKFLOW }} script: | const result = await github.rest.actions.createWorkflowDispatch({ owner: 'neuralmagic', repo: 'stratus', workflow_id: 'nm-pypi-update.yml', ref: 'main' }) console.log(result) # publish the wheel file to public pypi - name: push wheel to pypi.org if: ${{ steps.check-whl.outputs.status == '0' && inputs.push_to_pypi }} uses: neuralmagic/nm-actions/actions/publish-whl@v1.0.0 with: username: ${{ secrets.PYPI_PUBLIC_USER }} password: ${{ secrets.PYPI_PUBLIC_AUTH }} whl: ${{ steps.find-asset-whl.outputs.asset }} # publish the tar.gz file to public pypi - name: push wheel to pypi.org if: ${{ steps.check-whl.outputs.status == '0' && inputs.push_to_pypi }} uses: neuralmagic/nm-actions/actions/publish-whl@v1.0.0 with: username: ${{ secrets.PYPI_PUBLIC_USER }} password: ${{ secrets.PYPI_PUBLIC_AUTH }} whl: ${{ steps.find-asset-targz.outputs.asset }} compressed-tensors-0.9.4/.gitignore000066400000000000000000000061031500222531600173470ustar00rootroot00000000000000# Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / 
packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST src/compressed_tensors/version.py # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ examples/**/*.safetensors compressed-tensors-0.9.4/LICENSE000066400000000000000000000261351500222531600163730ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
compressed-tensors-0.9.4/Makefile000066400000000000000000000020371500222531600170210ustar00rootroot00000000000000.PHONY: build docs test BUILD_TYPE ?= dev # set nightly to build nightly release PYCHECKDIRS := src tests PYCHECKGLOBS := 'src/**/*.py' 'tests/**/*.py' 'utils/**/*.py' 'examples/**/*.py' setup.py # run checks on all files for the repo quality: @echo "Running copyright checks"; python utils/copyright.py quality $(PYCHECKGLOBS) @echo "Running python quality checks"; black --check $(PYCHECKDIRS); isort --check-only $(PYCHECKDIRS); flake8 $(PYCHECKDIRS); # style the code according to accepted standards for the repo style: @echo "Running copyright style"; python utils/copyright.py style $(PYCHECKGLOBS) @echo "Running python styling"; black $(PYCHECKDIRS); isort $(PYCHECKDIRS); # run tests for the repo test: @echo "Running python tests"; pytest tests; # creates wheel file build: @echo "Building the wheel for the repository"; BUILD_TYPE=$(BUILD_TYPE) python3 setup.py sdist bdist_wheel; # clean package clean: @echo "Cleaning up"; rm -rf .pytest_cache; find $(PYCHECKDIRS) | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf; compressed-tensors-0.9.4/README.md000066400000000000000000000135231500222531600166420ustar00rootroot00000000000000# compressed-tensors The `compressed-tensors` library extends the [safetensors](https://github.com/huggingface/safetensors) format, providing a versatile and efficient way to store and manage compressed tensor data. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more. ## Why `compressed-tensors`? As model compression becomes increasingly important for efficient deployment of LLMs, the landscape of quantization and compression techniques has become increasingly fragmented. Each method often comes with its own storage format and loading procedures, making it challenging to work with multiple techniques or switch between them. `compressed-tensors` addresses this by providing a single, extensible format that can represent a wide variety of compression schemes. * **Unified Checkpoint Format**: Supports various compression schemes in a single, consistent format. * **Wide Compatibility**: Works with popular quantization methods like GPTQ, SmoothQuant, and FP8. See [llm-compressor](https://github.com/vllm-project/llm-compressor) * **Flexible Quantization Support**: * Weight-only quantization (e.g., W4A16, W8A16, WnA16) * Activation quantization (e.g., W8A8) * KV cache quantization * Non-uniform schemes (different layers can be quantized in different ways!) * **Sparsity Support**: Handles both unstructured and semi-structured (e.g., 2:4) sparsity patterns. * **Open-Source Integration**: Designed to work seamlessly with Hugging Face models and PyTorch. This allows developers and researchers to easily experiment with composing different quantization methods, simplify model deployment pipelines, and reduce the overhead of supporting multiple compression formats in inference engines. ## Installation ### From [PyPI](https://pypi.org/project/compressed-tensors) Stable release: ```bash pip install compressed-tensors ``` Nightly release: ```bash pip install --pre compressed-tensors ``` ### From Source ```bash git clone https://github.com/neuralmagic/compressed-tensors cd compressed-tensors pip install -e . 
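# Optional sanity check (a minimal sketch): confirm the editable install is importable
python -c "import compressed_tensors; print(compressed_tensors.__file__)"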
``` ## Getting started ### Saving/Loading Compressed Tensors (Bitmask Compression) The function `save_compressed` uses the `compression_format` argument to apply compression to tensors. The function `load_compressed` reverses the process: converts the compressed weights on disk to decompressed weights in device memory. ```python from compressed_tensors import save_compressed, load_compressed, BitmaskConfig from torch import Tensor from typing import Dict # the example BitmaskConfig method efficiently compresses # tensors with large number of zero entries compression_config = BitmaskConfig() tensors: Dict[str, Tensor] = {"tensor_1": Tensor( [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]] )} # compress tensors using BitmaskConfig compression format (save them efficiently on disk) save_compressed(tensors, "model.safetensors", compression_format=compression_config.format) # decompress tensors (load_compressed returns a generator for memory efficiency) decompressed_tensors = {} for tensor_name, tensor in load_compressed("model.safetensors", compression_config = compression_config): decompressed_tensors[tensor_name] = tensor ``` ## Saving/Loading Compressed Models (Bitmask Compression) We can apply bitmask compression to a whole model. For more detailed example see `example` directory. ```python from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig from transformers import AutoModelForCausalLM model_name = "neuralmagic/llama2.c-stories110M-pruned50" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") original_state_dict = model.state_dict() compression_config = BitmaskConfig() # save compressed model weights save_compressed_model(model, "compressed_model.safetensors", compression_format=compression_config.format) # load compressed model weights (`dict` turns generator into a dictionary) state_dict = dict(load_compressed("compressed_model.safetensors", compression_config)) ``` For more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb). 
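As a quick sanity check, you can confirm that the round trip above is lossless by comparing the decompressed weights against the original model weights. This minimal sketch reuses `original_state_dict` and `state_dict` from the snippet above:

```python
import torch

# Every decompressed tensor should exactly match the corresponding original weight.
# `original_state_dict` and `state_dict` come from the bitmask example above.
all_equal = all(
    torch.equal(original_state_dict[name], tensor)
    for name, tensor in state_dict.items()
)
print(f"Decompressed weights match the originals: {all_equal}")
```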
## Saving a Compressed Model with PTQ We can use compressed-tensors to run basic post training quantization (PTQ) and save the quantized model compressed on disk ```python model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0", torch_dtype="auto") config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json") config.quantization_status = QuantizationStatus.CALIBRATION apply_quantization_config(model, config) dataset = load_dataset("ptb_text_only")["train"] tokenizer = AutoTokenizer.from_pretrained(model_name) def tokenize_function(examples): return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024) tokenized_dataset = dataset.map(tokenize_function, batched=True) data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator()) with torch.no_grad(): for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"): sample = {key: value.to(device) for key,value in sample.items()} _ = model(**sample) if idx >= 512: break model.apply(freeze_module_quantization) model.apply(compress_quantized_weights) output_dir = "./ex_llama1.1b_w4a16_packed_quantize" compressor = ModelCompressor(quantization_config=config) compressed_state_dict = compressor.compress(model) model.save_pretrained(output_dir, state_dict=compressed_state_dict) ``` For more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb). compressed-tensors-0.9.4/examples/000077500000000000000000000000001500222531600171755ustar00rootroot00000000000000compressed-tensors-0.9.4/examples/bit_packing/000077500000000000000000000000001500222531600214475ustar00rootroot00000000000000compressed-tensors-0.9.4/examples/bit_packing/ex_quantize_and_pack.py000066400000000000000000000057441500222531600262070ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #### # # The following example shows how to run QDQ inside `compressed-tensors` # QDQ (quantize & de-quantize) is a way to evaluate quantized model # accuracy but will not lead to a runtime speedup. # See `../llama_1.1b/ex_config_quantization.py` to go beyond QDQ # and quantize models that will run more performantly. 
# #### from pathlib import Path import torch from compressed_tensors.compressors import ModelCompressor from compressed_tensors.quantization import ( QuantizationConfig, QuantizationStatus, apply_quantization_config, ) from datasets import load_dataset from torch.utils.data import DataLoader from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer config_file = Path(__file__).parent / "int4_config.json" model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" dataset_name = "garage-bAInd/Open-Platypus" split = "train" num_calibration_samples = 128 max_seq_length = 512 pad_to_max_length = False output_dir = "./llama1.1b_new_quant_out_test_packing" device = "cuda:0" if torch.cuda.is_available() else "cpu" model = AutoModelForCausalLM.from_pretrained( model_name, device_map=device, torch_dtype="auto" ) model.eval() # no grad or updates needed for base model config = QuantizationConfig.model_validate_json(config_file.read_text()) # set status to calibration config.quantization_status = QuantizationStatus.CALIBRATION # initialize quantization apply_quantization_config(model, config) # create dataset dataset = load_dataset(dataset_name, split=f"train[:{num_calibration_samples}]") tokenizer = AutoTokenizer.from_pretrained(model_name) def tokenize_function(examples): return tokenizer( examples["output"], padding=False, truncation=True, max_length=1024 ) tokenized_dataset = dataset.map(tokenize_function, batched=True) data_loader = DataLoader( tokenized_dataset, batch_size=1, ) with torch.no_grad(): for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"): sample = {k: v.to(model.device) for k, v in sample.items()} _ = model(**sample) if idx >= num_calibration_samples: break # convert model to QDQ model compressor = ModelCompressor(quantization_config=config) compressed_state_dict = compressor.compress(model) # save QDQ model model.save_pretrained(output_dir, state_dict=compressed_state_dict) compressor.update_config(output_dir) compressed-tensors-0.9.4/examples/bit_packing/int4_config.json000066400000000000000000000005641500222531600245520ustar00rootroot00000000000000{ "quant_method": "compressed-tensors", "format": "pack-quantized", "global_compression_ratio": null, "config_groups": { "group_1": { "weights": { "num_bits": 4, "type": "int", "symmetric": false, "strategy": "tensor" }, "targets": ["Linear"] } } }compressed-tensors-0.9.4/examples/bitmask_compression.ipynb000066400000000000000000000214751500222531600243240ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Bitmask Compression Example ##\n", "\n", "Bitmask compression allows for storing sparse tensors efficiently on the disk. \n", "\n", "Instead of storing each zero element represented as an actual number, we use bitmask to indicate which tensor entries correspond to zero elements. This approach is useful when the matrix is mostly zero values, as it saves space by not wastefully storing those zeros explicitly.\n", "\n", "The example below shows how to save and load sparse tensors using bitmask compression. It also demonstrates the benefits of the bitmask compression over \"dense\" representation, and finally, introduces the enhanced `safetensors` file format for storing sparse weights." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import os\n", "from safetensors import safe_open\n", "from safetensors.torch import save_model\n", "from compressed_tensors import save_compressed_model, load_compressed, BitmaskConfig\n", "from transformers import AutoModelForCausalLM" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32000, 768)\n", " (layers): ModuleList(\n", " (0-11): 12 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=768, out_features=768, bias=False)\n", " (k_proj): Linear(in_features=768, out_features=768, bias=False)\n", " (v_proj): Linear(in_features=768, out_features=768, bias=False)\n", " (o_proj): Linear(in_features=768, out_features=768, bias=False)\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=768, out_features=2048, bias=False)\n", " (up_proj): Linear(in_features=768, out_features=2048, bias=False)\n", " (down_proj): Linear(in_features=2048, out_features=768, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): LlamaRMSNorm((768,), eps=1e-05)\n", " (post_attention_layernorm): LlamaRMSNorm((768,), eps=1e-05)\n", " )\n", " )\n", " (norm): LlamaRMSNorm((768,), eps=1e-05)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (lm_head): Linear(in_features=768, out_features=32000, bias=False)\n", ")" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load a tiny, pruned llama2 model\n", "model_name = \"neuralmagic/llama2.c-stories110M-pruned50\"\n", "model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=\"auto\")\n", "model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The example layer model.layers.0.self_attn.q_proj.weight has sparsity 50%\n" ] } ], "source": [ "# most of the weights of the model are pruned to 50% (except for few layers such as lm_head or embeddings)\n", "state_dict = model.state_dict()\n", "state_dict.keys()\n", "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n", "print(f\"The example layer {example_layer} has sparsity {100 * state_dict[example_layer].eq(0).sum().item() / state_dict[example_layer].numel():.0f}%\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The model is 32% sparse overall\n" ] } ], "source": [ "# we can inspect to total sparsity of the state_dict\n", "total_num_parameters = 0\n", "total_num_zero_parameters = 0\n", "for key in state_dict:\n", " total_num_parameters += state_dict[key].numel()\n", " total_num_zero_parameters += state_dict[key].eq(0).sum().item()\n", "print(f\"The model is {total_num_zero_parameters/total_num_parameters*100:.0f}% sparse overall\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Compressing model: 100%|██████████| 111/111 [00:00<00:00, 313.39it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Size of the model's weights on disk using safetensors: 417.83 MB\n", "Size of the model's weights on disk using compressed-tensors: 366.82 MB\n", "The compression ratio is x1.14\n" ] } ], "source": [ "# let's save the model on disk using safetensors and compressed-tensors and compare 
the size on disk\n", "\n", "## save the model using safetensors ##\n", "save_model(model, \"model.safetensors\")\n", "size_on_disk_mb = os.path.getsize('model.safetensors') / 1024 / 1024\n", "\n", "## save the model using compressed-tensors ##\n", "save_compressed_model(model, \"compressed_model.safetensors\", compression_format=\"sparse-bitmask\")\n", "compressed_size_on_disk_mb = os.path.getsize('compressed_model.safetensors') / 1024 / 1024\n", "\n", "print(f\"Size of the model's weights on disk using safetensors: {size_on_disk_mb:.2f} MB\")\n", "print(f\"Size of the model's weights on disk using compressed-tensors: {compressed_size_on_disk_mb:.2f} MB\")\n", "print(\"The compression ratio is x{:.2f}\".format(size_on_disk_mb / compressed_size_on_disk_mb))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Storing weights with around 30% of zero entries requires significantly less disk space when using `compressed-tensors`. The compression ratio improves radically for more sparse models. \n", "\n", "We can load back the `state_dict` from the compressed and uncompressed representation on disk and confirm, that they represent same tensors in memory." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Once loaded, the state_dicts from safetensors and compressed-tensors are equal: True\n" ] } ], "source": [ "# load the safetensor and the compressed-tensor and show that they have the same representation\n", "\n", "## load the uncompressed safetensors to memory ##\n", "state_dict_1 = {}\n", "with safe_open('model.safetensors', framework=\"pt\") as f:\n", " for key in f.keys():\n", " state_dict_1[key] = f.get_tensor(key)\n", "\n", "## load the compressed-tensors to memory ##\n", "config = BitmaskConfig() # we need to specify the method for decompression\n", "state_dict_2 = dict(load_compressed(\"compressed_model.safetensors\", config)) # load_compressed returns a generator, we convert it to a dict\n", "\n", "tensors_equal = all(torch.equal(state_dict_1[key], state_dict_2[key]) for key in state_dict_1)\n", "\n", "print(f\"Once loaded, the state_dicts from safetensors and compressed-tensors are equal: {tensors_equal}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SafeTensors File Format\n", "\n", "The reason why the introduced bitmask compression is much more efficient, is imbibing the information about the compression in the header of the `.safetensors` file.\n", "For each parameter in the uncompressed `state_dict`, we store the following attributes needed for decompression in the compressed `state_dict`:\n", "\n", "* Compressed tensor\n", "* Bitmask\n", "* Uncompressed shape\n", "* Row offsets\n", "\n", "```bash\n", "# Dense\n", "{\n", " PARAM_NAME: uncompressed_tensor\n", "}\n", "\n", "# Compressed\n", "{\n", " PARAM_NAME.compressed: compressed_tensor, # 1d tensor\n", " PARAM_NAME.bitmask: value, # 2d bitmask tensor (nrows x (ncols / 8))\n", " PARAM_NAME.shape: value, # Uncompressed shape tensor\n", " PARAM_NAME.row_offsets: value # 1d offsets tensor\n", "}\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 } 
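Building on the per-parameter storage layout described at the end of the notebook above, the following minimal sketch shows how the compressed keys can be inspected directly. It assumes the `compressed_model.safetensors` file written by that notebook is present in the working directory:

```python
from safetensors import safe_open

# List the keys that bitmask compression stores for each parameter, e.g.
# "<param>.compressed", "<param>.bitmask", "<param>.shape", "<param>.row_offsets"
with safe_open("compressed_model.safetensors", framework="pt") as f:
    for key in sorted(f.keys())[:8]:
        print(key)
```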
compressed-tensors-0.9.4/examples/llama_1.1b/000077500000000000000000000000001500222531600210045ustar00rootroot00000000000000compressed-tensors-0.9.4/examples/llama_1.1b/ex_config_quantization.py000066400000000000000000000111051500222531600261230ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #### # # The following example shows how a model can be calibrated and # compressed entirely with primitives within `compressed-tensors` # using PyTorch hooks. # The resulting model's .safetensors file should be 1.2GB, # whereas the original model's .safetensors file is 4.1GB. # See `./ex_llmcompressor_quantization.py` for how this can be # simplified using the vllm's `llm-compressor` package # #### from pathlib import Path import torch from compressed_tensors.compressors import ModelCompressor from compressed_tensors.quantization import ( QuantizationConfig, QuantizationStatus, apply_quantization_config, ) from datasets import load_dataset from torch.utils.data import DataLoader, RandomSampler from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator config_file = Path(__file__).parent / "example_quant_config.json" model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" dataset_name = "garage-bAInd/Open-Platypus" split = "train" num_calibration_samples = 512 max_seq_length = 1024 pad_to_max_length = False output_dir = "./llama1.1b_new_quant_out" device = "cuda:0" if torch.cuda.is_available() else "cpu" model = AutoModelForCausalLM.from_pretrained( model_name, device_map=device, torch_dtype="auto" ) model.eval() # no grad or updates needed for base model config = QuantizationConfig.model_validate_json(config_file.read_text()) # set status to calibration config.quantization_status = QuantizationStatus.CALIBRATION # initialize quantization apply_quantization_config(model, config) # create hook to keep track of scales and zero points on each module with a quantization_scheme def update_scale_zp_hook( module: torch.nn.Module, input: torch.Tensor, _output: torch.Tensor ): from compressed_tensors.quantization.utils import calculate_qparams from compressed_tensors.utils import update_parameter_data quantization_scheme = getattr(module, "quantization_scheme", None) if not quantization_scheme: # no quantization scheme nothing to do return # update weight scale / zero-point quantization_args = getattr(quantization_scheme, "weights", None) min_val, max_val = torch.aminmax(module.weight.data) scale, zp = calculate_qparams(min_val, max_val, quantization_args) update_parameter_data(module, scale, "weight_scale") update_parameter_data(module, zp, "weight_zero_point") # update input_activations scale / zero-point quantization_args = getattr(quantization_scheme, "input_activations", None) min_val, max_val = torch.aminmax(input[0]) scale, zp = calculate_qparams(min_val, max_val, quantization_args) update_parameter_data(module, scale, "input_scale") update_parameter_data(module, 
zp, "input_zero_point") return # register hook on each submodule in model (recursively) model.apply(lambda module: module.register_forward_hook(update_scale_zp_hook)) # create dataset dataset = load_dataset(dataset_name, split=f"train[:{num_calibration_samples}]") tokenizer = AutoTokenizer.from_pretrained(model_name) def tokenize_function(examples): return tokenizer( examples["output"], padding=False, truncation=True, max_length=1024 ) tokenized_dataset = dataset.map(tokenize_function, batched=True) data_loader = DataLoader( tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator(), sampler=RandomSampler(tokenized_dataset), ) # run calibration, hook will update scales and zero points where applicable with torch.no_grad(): for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"): sample = {k: v.to(model.device) for k, v in sample.items()} _ = model(**sample) if idx >= num_calibration_samples: break # apply compression compressor = ModelCompressor(quantization_config=config) compressed_state_dict = compressor.compress(model) # save quantized model model.save_pretrained(output_dir, state_dict=compressed_state_dict) compressor.update_config(output_dir) compressed-tensors-0.9.4/examples/llama_1.1b/ex_llmcompressor_quantization.py000066400000000000000000000031141500222531600275600ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
#### # # The following example shows how the example in `ex_config_quantization.py` # can be done within vllm's llm-compressor project # Be sure to `pip install llmcompressor` before running # See https://github.com/vllm-project/llm-compressor for more information # #### from pathlib import Path import torch from llmcompressor.transformers import oneshot recipe = str(Path(__file__).parent / "example_quant_recipe.yaml") model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" dataset_name = "open_platypus" split = "train" num_calibration_samples = 512 max_seq_length = 1024 pad_to_max_length = False output_dir = "./llama1.1b_llmcompressor_quant_out" device = "cuda:0" if torch.cuda.is_available() else "cpu" oneshot( model=model_name, dataset=dataset_name, output_dir=output_dir, overwrite_output_dir=True, max_seq_length=max_seq_length, num_calibration_samples=num_calibration_samples, recipe=recipe, pad_to_max_length=pad_to_max_length, ) compressed-tensors-0.9.4/examples/llama_1.1b/example_quant_config.json000066400000000000000000000010541500222531600260670ustar00rootroot00000000000000{ "quant_method": "compressed-tensors", "format": "naive-quantized", "global_compression_ratio": null, "config_groups": { "group_1": { "weights": { "num_bits": 8, "type": "int", "symmetric": true, "strategy": "tensor" }, "input_activations": { "num_bits": 8, "type": "int", "symmetric": true, "strategy": "tensor" }, "targets": ["Linear"] } } }compressed-tensors-0.9.4/examples/llama_1.1b/example_quant_recipe.yaml000066400000000000000000000021261500222531600260630ustar00rootroot00000000000000test_stage: quant_modifiers: QuantizationModifier: ignore: - model.layers.0.mlp.down_proj - LlamaRotaryEmbedding - LlamaRMSNorm - SiLU - MatMulLeftInput_QK - MatMulRightInput_QK - MatMulOutput_QK - MatMulLeftInput_PV - MatMulRightInput_PV - MatMulOutput_PV scheme_overrides: Linear: weights: num_bits: 8 symmetric: true strategy: "tensor" input_activations: num_bits: 8 symmetric: false strategy: "tensor" output_activations: null Embedding: weights: num_bits: 8 symmetric: true strategy: "tensor" input_activations: null output_activations: nullcompressed-tensors-0.9.4/examples/quantize_and_pack_int4.ipynb000066400000000000000000000274471500222531600246740ustar00rootroot00000000000000{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## W4A16 Quantization and Compression ##\n", "\n", "Using compressed-tensors, we can compress a quantized model to store it more efficiently on disk.\n", "\n", "In this example, we run post-training quantization (PTQ) to quantize the weights of an example model to 4 bits. We then save a compressed version of the model on disk by packing each group of eight 4-bit weights into a single int32\n", "\n", "By packing groups of eight 4-bit weights into a single int32, we can store a quantized model more efficiently on disk." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import os\n", "from tqdm import tqdm\n", "from compressed_tensors.quantization import (\n", " QuantizationConfig,\n", " QuantizationStatus,\n", " apply_quantization_config,\n", " compress_quantized_weights\n", ")\n", "from compressed_tensors.compressors import ModelCompressor\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator\n", "from datasets import load_dataset\n", "from torch.utils.data import RandomSampler\n", "from torch.utils.data import DataLoader" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c883cdc8ecd04866bd01d61796b81c26", "version_major": 2, "version_minor": 0 }, "text/plain": [ "config.json: 0%| | 0.00/560 [00:00= num_calibration_samples:\n", " break\n", "\n", "# freeze scale and zero points after calibration\n", "# model.apply(freeze_module_quantization)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After running calibration, each quantized layer will have a new scale and zero_point parameter as shown below.\n", "\n", "Notice that at this point, the weight itself is still a floating point and has not been quantized. \n", "\n", "To convert the weights to an integer type, we need to apply the `compress_quantized_weights` function. After compressing the weights, a forward pass of the model can no longer be run in PyTorch" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Scale: tensor([17296.], device='cuda:4', dtype=torch.float16), Zero Point: tensor([0], device='cuda:4', dtype=torch.int8)\n", "Weight min: -1.587890625 max: 1.0283203125 dtype: torch.float16\n" ] } ], "source": [ "state_dict = model.state_dict()\n", "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n", "scale = state_dict[example_layer + \"_scale\"]\n", "zero_point = state_dict[example_layer + \"_zero_point\"]\n", "weight = state_dict[example_layer]\n", "print(f\"Scale: {scale}, Zero Point: {zero_point}\")\n", "print(f\"Weight min: {torch.min(weight)} max: {torch.max(weight)} dtype: {weight.dtype}\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Scale: tensor([17296.], device='cuda:4', dtype=torch.float16), Zero Point: tensor([0], device='cuda:4', dtype=torch.int8)\n", "Weight min: 0 max: 0 dtype: torch.int8\n" ] } ], "source": [ "# convert quantized weights to integers\n", "model.apply(compress_quantized_weights)\n", "\n", "state_dict = model.state_dict()\n", "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n", "scale = state_dict[example_layer + \"_scale\"]\n", "zero_point = state_dict[example_layer + \"_zero_point\"]\n", "weight = state_dict[example_layer]\n", "print(f\"Scale: {scale}, Zero Point: {zero_point}\")\n", "print(f\"Weight min: {torch.min(weight)} max: {torch.max(weight)} dtype: {weight.dtype}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "After compressing the quantized model, the weight matrix has a range of int4 but is stored in an int8. \n", "\n", "We can further compress the model on disk using the `pack-quantized` format we specified in the config. 
This compression format will pack the int4 weights into int32" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Compression format: pack-quantized\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Quantized Compression: 100%|██████████| 509/509 [00:03<00:00, 153.70it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Size of the model's weights on disk using safetensors: 712.23 MB\n" ] } ], "source": [ "# apply compression and save the model to disk\n", "\n", "output_dir = \"./ex_llama1.1b_w4a16_packed_quantize\"\n", "compression_format = config.format\n", "print(f\"Compression format: {compression_format}\")\n", "\n", "compressor = ModelCompressor(quantization_config=config)\n", "compressed_state_dict = compressor.compress(model)\n", "model.save_pretrained(output_dir, state_dict=compressed_state_dict)\n", "compressor.update_config(output_dir)\n", "\n", "compressed_size_on_disk_mb = os.path.getsize(os.path.join(output_dir, \"model.safetensors\")) / 1024 / 1024\n", "print(f\"Size of the model's weights on disk using safetensors: {compressed_size_on_disk_mb:.2f} MB\")" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 } compressed-tensors-0.9.4/pyproject.toml000066400000000000000000000003611500222531600202730ustar00rootroot00000000000000[build-system] requires = ["setuptools", "wheel", "setuptools_scm>8"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] version_file = "src/compressed_tensors/version.py" [tool.black] line-length = 88 target-version = ['py36'] compressed-tensors-0.9.4/setup.cfg000066400000000000000000000005361500222531600172040ustar00rootroot00000000000000[isort] profile = black default_section = FIRSTPARTY ensure_newline_before_comments = True force_grid_wrap = 0 include_trailing_comma = True sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER line_length = 88 lines_after_imports = 2 multi_line_output = 3 use_parentheses = True [flake8] ignore = E203, E251, E701, W503 max-line-length = 88 compressed-tensors-0.9.4/setup.py000066400000000000000000000100171500222531600170700ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os from setuptools import setup, find_packages from typing import List, Dict, Tuple # Set the build type using an environment variable to give us # different package names based on the reason for the build. 
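# For example (an illustrative invocation, not prescribed anywhere in this
# repo): running `BUILD_TYPE=nightly python -m build` selects the nightly
# version scheme defined in version_func() below, BUILD_TYPE=release on a
# clean tagged checkout uses the tag verbatim, and leaving BUILD_TYPE unset
# falls back to the default "dev" scheme with a local version suffix.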
VALID_BUILD_TYPES = {"release", "nightly", "dev"} BUILD_TYPE = os.environ.get("BUILD_TYPE", "dev") if BUILD_TYPE not in VALID_BUILD_TYPES: raise ValueError( f"Unsupported build type {BUILD_TYPE!r}, must be one of {VALID_BUILD_TYPES}" ) from setuptools_scm import ScmVersion def version_func(version: ScmVersion) -> str: from setuptools_scm.version import guess_next_version if BUILD_TYPE == "nightly": # Nightly builds use alpha versions to ensure they are marked # as pre-releases on pypi.org. return version.format_next_version( guess_next=guess_next_version, fmt="{guessed}.a{node_date:%Y%m%d}", ) if ( BUILD_TYPE == "release" and not version.dirty and (version.exact or version.node is None) ): # When we have a tagged version, use that without modification. return version.format_with("{tag}") # In development mode or when the local repository is dirty, treat # it is as local development version. return version.format_next_version( guess_next=guess_next_version, fmt="{guessed}.dev{distance}", ) def localversion_func(version: ScmVersion) -> str: from setuptools_scm.version import get_local_node_and_date # When we are building nightly versions, we guess the next release # and add the date as an alpha version. We cannot publish packages # with local versions, so we do not add one. if BUILD_TYPE == "nightly": return "" # When we have an exact tag, with no local changes, do not append # anything to the local version field. if ( BUILD_TYPE == "release" and not version.dirty and (version.exact or version.node is None) ): return "" # In development mode or when the local repository is dirty, # return a string that includes the git SHA (node) and a date, # formatted as a local version tag. return get_local_node_and_date(version) def _setup_long_description() -> Tuple[str, str]: return open("README.md", "r", encoding="utf-8").read(), "text/markdown" def _setup_packages() -> List: return find_packages( "src", include=["compressed_tensors", "compressed_tensors.*"], exclude=["*.__pycache__.*"] ) def _setup_install_requires() -> List: return ["torch>=1.7.0", "transformers", "pydantic>=2.0"] def _setup_extras() -> Dict: return { "dev": ["black==22.12.0", "isort==5.8.0", "wheel>=0.36.2", "flake8>=3.8.3", "pytest>=6.0.0", "nbconvert>=7.16.3"], "accelerate": ["accelerate"] } setup( name="compressed-tensors", use_scm_version={ "version_scheme": version_func, "local_scheme": localversion_func, }, author="Neuralmagic, Inc.", author_email="support@neuralmagic.com", license="Apache 2.0", description="Library for utilization of compressed safetensors of neural network models", long_description=_setup_long_description()[0], long_description_content_type=_setup_long_description()[1], url="https://github.com/neuralmagic/compressed-tensors", extras_require=_setup_extras(), install_requires=_setup_install_requires(), package_dir={"": "src"}, packages=_setup_packages(), ) compressed-tensors-0.9.4/src/000077500000000000000000000000001500222531600161465ustar00rootroot00000000000000compressed-tensors-0.9.4/src/__init__.py000066400000000000000000000011511500222531600202550ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/src/compressed_tensors/000077500000000000000000000000001500222531600220675ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/README.md000066400000000000000000000114161500222531600233510ustar00rootroot00000000000000# Save/Load Compressed SafeTensors ## Motivation * Reduce disk space by saving in a compressed format for sparse models. Models in this compressed format will be loaded by vLLM for more efficient inference * Set up the save/load architecture such that we can easily expand to additional compression formats in the future. The config should be human readable so users can understand the compression format at a quick glance ## SafeTensors File Format For each parameter in the uncompressed state_dict, we store the following attributes needed for decompression in the compressed state_dict: * compressed tensor * bitmask * uncompressed shape * row offsets ```python # dense { PARAM_NAME: uncompressed_tensor } # compressed { PARAM_NAME.compressed: compressed_tensor # 1d tensor PARAM_NAME.bitmask: value # 2d bitmask tensor (nrows x (ncols / 8)) PARAM_NAME.shape: value # uncompressed shape tensor PARAM_NAME.row_offsets: value # 1d offsets tensor } ``` Config information gets stored in the HF config file ```json // config.json { "sparsity_config": { "format": "sparse_bitmask", // "dense_sparsity" for original tensor format // informational "sparsity_structure": "unstructured", // or 2:4, 8:16 etc... "global_sparsity": "0.5" } } ``` ## Saving/Loading Interface Loading in a compressed model requires no interface changes ```python from transformers import AutoModelForCausalLM # should contain model.safetensors or model.safetensors.index.json model_path = "/PATH/TO/COMPRESSED_MODEL" model = AutoModelForCausalLM.from_pretrained( model_name_or_path=model_path, torch_dtype="auto", **model_kwargs, ) ``` Saving a compressed model with an explicitly provided compression config. The config is saved to the model's `config.json` file. **Note:** the model must have been initialized with AutoModelForCausalLM.from_pretrained() ```python from compressed_tensors import BitmaskConfig output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL" sparsity_config = BitmaskConfig() model.save_pretrained( save_directory=output_dir, sparsity_config=sparsity_config, ) ``` Saving a compressed model, inferring the config from the model attributes ```python model.save_pretrained( save_directory=output_dir, save_compressed=True ) ``` Saving a model in the dense format. If the model has at least 5% global sparsity a sparsity config will still be included in `config.json` with format `dense_sparsity` ```python model.save_pretrained( save_directory=output_dir ) ``` Saving a model in the dense format, bypassing the sparsity config calculation. 
When the `skip_compression_stats` flag is set, no sparsity config will be written to `config.json` ```python model.save_pretrained( save_directory=output_dir skip_compression_stats=True ) ``` ## Enable Compression During One-Shot and Sparse Finetunining Models that are saved in a supported compressed format on disk will automatically be decompressed when loaded as input to `llmcompressor.transformers.oneshot` or `llmcompressor.transformers.train` To enable compression on save after oneshot or finetuning simply add the `save_compressed=True` argument to `llmcompressor.transformers.oneshot` or `llmcompressor.transformers.train` ```python from llmcompressor.transformers import train train( save_compressed=True, model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4", recipe=RECIPE, dataset=DATASET ) ``` ## Example Code Loads a 60% sparse model, compresses it using the inferred bitmask compression, then reloads the compressed model. ```python import torch from transformers import AutoModelForCausalLM from llmcompressor.utils.pytorch.utils import measure_cuda_memory MODEL_PATH = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" OUTPUT_PATH = "./test_compress_output" RECIPE = "zoo:llama2-7b-open_platypus_orca_llama2_pretrain-pruned60" torch.cuda.set_device(0) with measure_cuda_memory() as m: model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0", torch_dtype="auto") print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") sparsity_config = getattr(model,"sparsity_config", None) print(f"Sparsity config before compression: {sparsity_config}") with measure_cuda_memory() as m: model.save_pretrained(OUTPUT_PATH, save_compressed=True) print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") torch.cuda.set_device(1) with measure_cuda_memory() as m: model_again = AutoModelForCausalLM.from_pretrained( OUTPUT_PATH, device_map="cuda:1", torch_dtype="auto" ) print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB") sparsity_config = getattr(model_again,"sparsity_config", None) print(f"Sparsity config after compression: {sparsity_config}") ``` compressed-tensors-0.9.4/src/compressed_tensors/__init__.py000066400000000000000000000014541500222531600242040ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .base import * # flake8: noqa from .compressors import * from .config import * from .quantization import QuantizationConfig, QuantizationStatus from .utils import * from .version import * compressed-tensors-0.9.4/src/compressed_tensors/base.py000066400000000000000000000015531500222531600233570ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. SPARSITY_CONFIG_NAME = "sparsity_config" QUANTIZATION_CONFIG_NAME = "quantization_config" COMPRESSION_CONFIG_NAME = "compression_config" KV_CACHE_SCHEME_NAME = "kv_cache_scheme" COMPRESSION_VERSION_NAME = "version" QUANTIZATION_METHOD_NAME = "quant_method" compressed-tensors-0.9.4/src/compressed_tensors/compressors/000077500000000000000000000000001500222531600244465ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/__init__.py000066400000000000000000000014711500222531600265620ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .base import * from .helpers import * from .model_compressors import * from .quantized_compressors import * from .sparse_compressors import * from .sparse_quantized_compressors import * compressed-tensors-0.9.4/src/compressed_tensors/compressors/base.py000066400000000000000000000160441500222531600257370ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from abc import ABC, abstractmethod from typing import Dict, Generator, Optional, Tuple, Union import torch from compressed_tensors.config import SparsityCompressionConfig from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig from compressed_tensors.registry import RegistryMixin from compressed_tensors.utils import has_offloaded_params from torch import Tensor from torch.nn import Module __all__ = ["BaseCompressor"] class BaseCompressor(RegistryMixin, ABC): """ Base class representing a model compression algorithm. Each child class should implement compression_param_info, compress_weight and decompress_weight. Compressors support compressing/decompressing a full module state dict or a single quantized PyTorch leaf module. 
Model Load Lifecycle (run_compressed=False): - ModelCompressor.decompress() - apply_quantization_config() - BaseCompressor.decompress() Model Save Lifecycle: - ModelCompressor.compress() - BaseCompressor.compress() Module Lifecycle (run_compressed=True): - apply_quantization_config() - compressed_module = CompressedLinear(module) - initialize_module_for_quantization() - BaseCompressor.compression_param_info() - register_parameters() - compressed_module.forward() -compressed_module.decompress() :param config: config specifying compression parameters """ def __init__( self, config: Union[SparsityCompressionConfig, QuantizationConfig, None] = None ): self.config = config def compression_param_info( self, weight_shape: torch.Size, quantization_args: Optional[QuantizationArgs] = None, ) -> Dict[str, Tuple[torch.Size, torch.dtype]]: """ Creates a dictionary of expected shapes and dtypes for each compression parameter used by the compressor :param weight_shape: uncompressed weight shape :param quantization_args: quantization parameters for the weight :return: dictionary mapping compressed parameter names to shape and dtype """ raise NotImplementedError() @property @abstractmethod def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ raise NotImplementedError() @abstractmethod def compress( self, model_state: Dict[str, Tensor], **kwargs, ) -> Dict[str, Tensor]: """ Compresses a dense state dict :param model_state: state dict of uncompressed model :param kwargs: additional arguments for compression :return: compressed state dict """ raise NotImplementedError() @abstractmethod def decompress( self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs, ) -> Generator[Tuple[str, Tensor], None, None]: """ Reads a compressed state dict located at path_to_model_or_tensors and returns a generator for sequentially decompressing back to a dense state dict :param path_to_model_or_tensors: path to compressed safetensors model (directory with one or more safetensors files) or compressed tensors file :param names_to_scheme: quantization args for each quantized weight :param device: optional device to load intermediate weights into :return: compressed state dict """ raise NotImplementedError() def compress_module(self, module: Module) -> Optional[Dict[str, torch.Tensor]]: """ Compresses a single quantized leaf PyTorch module. If the module is not quantized, this function has no effect. 
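A minimal usage sketch (illustrative; assumes ``compressor`` is a concrete
BaseCompressor subclass and ``module`` already carries a populated
``quantization_scheme``):

    compressed = compressor.compress_module(module)
    if compressed is not None:
        ...  # dict of compressed weight data, ready to be written to disk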
:param module: PyTorch module to compress :return: dictionary of compressed weight data, or None if module is not quantized """ if not hasattr(module, "quantization_scheme"): return None # module is not quantized quantization_scheme = module.quantization_scheme if not hasattr(quantization_scheme, "weights"): return None # weights are not quantized quantization_args = quantization_scheme.weights weight = getattr(module, "weight", None) weight_scale = getattr(module, "weight_scale", None) weight_zero_point = getattr(module, "weight_zero_point", None) return self.compress_weight( weight=weight, scale=weight_scale, zero_point=weight_zero_point, quantization_args=quantization_args, ) def compress_weight( self, weight: Tensor, **kwargs, ) -> Dict[str, torch.Tensor]: """ Compresses a single uncompressed weight :param weight: uncompressed weight tensor :param kwargs: additional arguments for compression """ raise NotImplementedError() def decompress_module(self, module: Module): """ Decompresses a single compressed leaf PyTorch module. If the module is not quantized, this function has no effect. :param module: PyTorch module to decompress :return: tensor of the decompressed weight, or None if module is not quantized """ params_device = next(module.parameters()).device device = "cpu" if has_offloaded_params(module) else params_device if not hasattr(module, "quantization_scheme"): return None # module is not quantized quantization_scheme = module.quantization_scheme if not hasattr(quantization_scheme, "weights"): return None # weights are not quantized quantization_args = quantization_scheme.weights compressed_data = {} for name, parameter in module.named_parameters(): compressed_data[name] = parameter return self.decompress_weight( compressed_data=compressed_data, quantization_args=quantization_args ).to(device) def decompress_weight( self, compressed_data: Dict[str, Tensor], **kwargs ) -> torch.Tensor: """ Decompresses a single compressed weight :param compressed_data: dictionary of data needed for decompression :param kwargs: additional arguments for decompression :return: tensor of the decompressed weight """ raise NotImplementedError() compressed-tensors-0.9.4/src/compressed_tensors/compressors/helpers.py000066400000000000000000000124671500222531600264740ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
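# Illustrative usage of the helpers defined below (a sketch; ``model`` stands
# in for any torch.nn.Module):
#
#   from compressed_tensors.compressors import save_compressed, load_compressed
#
#   save_compressed(model.state_dict(), "model.safetensors")   # dense by default
#   for name, tensor in load_compressed("model.safetensors"):  # yields (name, tensor)
#       ...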
from pathlib import Path from typing import Dict, Generator, Optional, Tuple, Union import torch from compressed_tensors.compressors import BaseCompressor from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig from compressed_tensors.utils.safetensors_load import get_weight_mappings from safetensors import safe_open from safetensors.torch import save_file from torch import Tensor __all__ = [ "load_compressed", "save_compressed", "save_compressed_model", ] def save_compressed( tensors: Dict[str, Tensor], save_path: Union[str, Path], compression_format: Optional[CompressionFormat] = None, ): """ Save compressed tensors to disk. If tensors are not compressed, save them as is. :param tensors: dictionary of tensors to compress :param save_path: path to save compressed tensors :param compression_format: compression format used for the tensors :return: compression config, if tensors were compressed - None otherwise """ if tensors is None or len(tensors) == 0: raise ValueError("No tensors or empty tensors provided to compress") # if no compression_format specified, default to `dense` compression_format = compression_format or CompressionFormat.dense.value if not ( compression_format in BaseCompressor.registered_names() or compression_format in BaseCompressor.registered_aliases() ): raise ValueError( f"Unknown compression format: {compression_format}. " f"Must be one of {set(BaseCompressor.registered_names() + BaseCompressor.registered_aliases())}" # noqa E501 ) # compress compressor = BaseCompressor.load_from_registry(compression_format) # save compressed tensors compressed_tensors = compressor.compress(tensors) save_file(compressed_tensors, save_path) def load_compressed( compressed_tensors: Union[str, Path], compression_config: SparsityCompressionConfig = None, device: Optional[str] = "cpu", ) -> Generator[Tuple[str, Tensor], None, None]: """ Load compressed tensors from disk. If tensors are not compressed, load them as is. :param compressed_tensors: path to compressed tensors. This can be a path to a file or a directory containing one or multiple safetensor files (if multiple - in the format assumed by huggingface) :param compression_config: compression config to use for decompressing tensors. :param device: device to move tensors to. If None, tensors are loaded on CPU. 
:param return_dict: if True, return a dictionary of decompressed tensors :return a generator that yields the name and tensor of the decompressed tensor """ if compressed_tensors is None or not Path(compressed_tensors).exists(): raise ValueError("No compressed tensors provided to load") if ( compression_config is None or compression_config.format == CompressionFormat.dense.value ): # if no compression_config specified, or `dense` format specified, # assume tensors are not compressed on disk weight_mappings = get_weight_mappings(compressed_tensors) for weight_name, file_with_weight_name in weight_mappings.items(): with safe_open(file_with_weight_name, framework="pt", device=device) as f: weight = f.get_tensor(weight_name) yield weight_name, weight else: # decompress tensors compression_format = compression_config.format compressor = BaseCompressor.load_from_registry( compression_format, config=compression_config ) yield from compressor.decompress(compressed_tensors, device=device) def save_compressed_model( model: torch.nn.Module, filename: str, compression_format: Optional[CompressionFormat] = None, force_contiguous: bool = True, ): """ Wrapper around safetensors `save_model` helper function, which allows for saving compressed model to disk. Note: The model is assumed to have a state_dict with unique entries :param model: model to save on disk :param filename: filename location to save the file :param compression_format: compression format used for the model :param force_contiguous: forcing the state_dict to be saved as contiguous tensors """ state_dict = model.state_dict() if force_contiguous: state_dict = {k: v.contiguous() for k, v in state_dict.items()} try: save_compressed(state_dict, filename, compression_format=compression_format) except ValueError as e: msg = str(e) msg += " Or use save_compressed_model(..., force_contiguous=True), read the docs for potential caveats." # noqa E501 raise ValueError(msg) compressed-tensors-0.9.4/src/compressed_tensors/compressors/model_compressors/000077500000000000000000000000001500222531600302055ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/model_compressors/__init__.py000066400000000000000000000012321500222531600323140ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .model_compressor import * compressed-tensors-0.9.4/src/compressed_tensors/compressors/model_compressors/model_compressor.py000066400000000000000000000654731500222531600341520ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import json import logging import operator import os import re from contextlib import contextmanager from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union import compressed_tensors import torch import transformers from compressed_tensors.base import ( COMPRESSION_VERSION_NAME, QUANTIZATION_CONFIG_NAME, QUANTIZATION_METHOD_NAME, SPARSITY_CONFIG_NAME, ) from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.compressors.sparse_compressors import DenseCompressor from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig from compressed_tensors.quantization import ( DEFAULT_QUANTIZATION_METHOD, QuantizationConfig, QuantizationStatus, apply_quantization_config, load_pretrained_quantization_parameters, ) from compressed_tensors.quantization.lifecycle import expand_target_names from compressed_tensors.quantization.quant_args import QuantizationArgs from compressed_tensors.quantization.utils import ( is_module_quantized, iter_named_leaf_modules, ) from compressed_tensors.utils import ( get_safetensors_folder, has_offloaded_params, merge_names, register_offload_parameter, update_parameter_data, ) from compressed_tensors.utils.helpers import ( fix_fsdp_module_name, is_compressed_tensors_config, ) from torch import Tensor from torch.nn import Module from tqdm import tqdm from transformers import AutoConfig from transformers.file_utils import CONFIG_NAME __all__ = ["ModelCompressor", "map_modules_to_quant_args"] _LOGGER: logging.Logger = logging.getLogger(__name__) if TYPE_CHECKING: # dummy type if not available from transformers CompressedTensorsConfig = TypeVar("CompressedTensorsConfig") class ModelCompressor: """ Handles compression and decompression of a model with a sparsity config and/or quantization config. 
Compression LifeCycle - compressor = ModelCompressor.from_pretrained_model(model) - compressed_state_dict = compressor.compress(model, state_dict) - compressor.quantization_compressor.compress(model, state_dict) - compressor.sparsity_compressor.compress(model, state_dict) - model.save_pretrained(output_dir, state_dict=compressed_state_dict) - compressor.update_config(output_dir) Decompression LifeCycle - compressor = ModelCompressor.from_pretrained(comp_model_path) - model = AutoModel.from_pretrained(comp_model_path) - compressor.decompress(comp_model_path, model) - compressor.sparsity_compressor.decompress(comp_model_path, model) - compressor.quantization_compressor.decompress(comp_model_path, model) :param sparsity_config: config specifying sparsity compression parameters :param quantization_config: config specifying quantization compression parameters """ @classmethod def from_pretrained( cls, pretrained_model_name_or_path: str, **kwargs, ) -> Optional["ModelCompressor"]: """ Given a path to a model config, extract the sparsity and/or quantization configs and load a ModelCompressor :param pretrained_model_name_or_path: path to model config on disk or HF hub :return: compressor for the configs, or None if model is not compressed """ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None) return cls.from_compression_config(compression_config) @classmethod def from_compression_config( cls, compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"], ): """ :param compression_config: A compression or quantization config The type is one of the following: 1. A Dict found under either "quantization_config" or "compression_config" keys in the config.json 2. 
A CompressedTensorsConfig found under key "quantization_config" in HF model config :return: compressor for the configs, or None if model is not compressed """ if compression_config is None: return None sparsity_config = cls.parse_sparsity_config(compression_config) quantization_config = cls.parse_quantization_config(compression_config) if sparsity_config is None and quantization_config is None: return None if sparsity_config is not None: format = sparsity_config.get("format") sparsity_config = SparsityCompressionConfig.load_from_registry( format, **sparsity_config ) if quantization_config is not None: quantization_config = QuantizationConfig.model_validate(quantization_config) return cls( sparsity_config=sparsity_config, quantization_config=quantization_config ) @classmethod def from_pretrained_model( cls, model: Module, sparsity_config: Union[SparsityCompressionConfig, str, None] = None, quantization_format: Optional[str] = None, ) -> Optional["ModelCompressor"]: """ Given a pytorch model and optional sparsity and/or quantization configs, load the appropriate compressors :param model: pytorch model to target for compression :param sparsity_config: a filled in sparsity config or string corresponding to a sparsity compression algorithm :param quantization_format: string corresponding to a quantization compression algorithm :return: compressor for the configs, or None if model is not compressed """ quantization_config = QuantizationConfig.from_pretrained( model, format=quantization_format ) if isinstance(sparsity_config, str): # we passed in a sparsity format sparsity_config = SparsityCompressionConfig.load_from_registry( sparsity_config ) if sparsity_config is None and quantization_config is None: return None return cls( sparsity_config=sparsity_config, quantization_config=quantization_config ) @staticmethod def parse_sparsity_config( compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"] ) -> Union[Dict[str, Any], None]: """ Parse sparsity config from quantization/compression config. Sparsity config is nested inside q/c config :param compression_config: quantization/compression config :return: sparsity config """ if compression_config is None: return None if is_compressed_tensors_config(compression_config): s_config = compression_config.sparsity_config return s_config.model_dump() if s_config is not None else None return compression_config.get(SPARSITY_CONFIG_NAME, None) @staticmethod def parse_quantization_config( compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"] ) -> Union[Dict[str, Any], None]: """ Parse quantization config from quantization/compression config. 
The quantization are all the fields that are not the sparsity config or metadata fields :param compression_config: quantization/compression config :return: quantization config without sparsity config or metadata fields """ if compression_config is None: return None if is_compressed_tensors_config(compression_config): q_config = compression_config.quantization_config return q_config.model_dump() if q_config is not None else None quantization_config = deepcopy(compression_config) quantization_config.pop(SPARSITY_CONFIG_NAME, None) # some fields are required, even if a qconfig is not present # pop them off and if nothing remains, then there is no qconfig quant_method = quantization_config.pop(QUANTIZATION_METHOD_NAME, None) _ = quantization_config.pop(COMPRESSION_VERSION_NAME, None) if len(quantization_config) == 0: return None # replace popped off values # note that version is discarded for now if quant_method is not None: quantization_config[QUANTIZATION_METHOD_NAME] = quant_method return quantization_config def __init__( self, sparsity_config: Optional[SparsityCompressionConfig] = None, quantization_config: Optional[QuantizationConfig] = None, ): self.sparsity_config = sparsity_config self.quantization_config = quantization_config self.sparsity_compressor = None self.quantization_compressor = None if sparsity_config is not None: self.sparsity_compressor = BaseCompressor.load_from_registry( sparsity_config.format, config=sparsity_config ) if quantization_config is not None: self.quantization_compressor = BaseCompressor.load_from_registry( quantization_config.format, config=quantization_config ) def get_missing_module_keys(self, model: Module) -> List[str]: """ Identifies the expected missing weight keys in the compressed state_dict. When a model undergoes sparsity or quantization compression, certain weight tensors may be absent from the checkpoint by virtue of compression. This function determines which weight keys are missing based on the applied compression techniques. :param model: The PyTorch model to check for missing keys. :return: A list of missing keys expected in the compressed state_dict. """ missing_keys = set() # Determine missing keys due to sparsity compression if ( self.sparsity_compressor and self.sparsity_config.format != CompressionFormat.dense.value ): sparse_targets = expand_target_names( model=model, targets=self.sparsity_config.targets, ignore=self.sparsity_config.ignore, ) missing_keys.update( merge_names(target, "weight") for target in sparse_targets ) # Determine missing keys due to pack quantization if ( self.quantization_compressor and self.quantization_config.format == CompressionFormat.pack_quantized.value ): for scheme in self.quantization_config.config_groups.values(): quant_targets = expand_target_names( model=model, targets=scheme.targets, ignore=self.quantization_config.ignore, ) missing_keys.update( merge_names(target, "weight") for target in quant_targets ) return list(missing_keys) def get_unexpected_file_keys(self, model: Module) -> List[str]: """ Identifies extra keys introduced by the compression process in the compressed state_dict that are not expected by the model graph. During sparsity or quantization compression, additional metadata or auxiliary parameters may be stored in the checkpoint, which do not correspond to any parameter in the original model. These keys are typically introduced to support the reconstruction of compressed weights. 
For example, Sparse24Bitmask compression may introduce keys such as 'compressed', 'bitmask', and 'shape' in the checkpoint, which are not part of the original model parameters. :param model: The PyTorch model to check for unexpected keys. :return: A list of extra keys introduced by the compression process that are not expected by the model. """ unexpected_keys = set() # Identify unexpected keys from sparsity compression if ( self.sparsity_compressor and self.sparsity_config.format != CompressionFormat.dense.value ): sparse_targets: Set[str] = expand_target_names( model=model, targets=self.sparsity_config.targets, ignore=self.sparsity_config.ignore, ) unexpected_keys.update( merge_names(target, param) for target in sparse_targets for param in self.sparsity_compressor.compression_param_names ) # Identify unexpected keys from quantization compression if self.quantization_compressor: for scheme in self.quantization_config.config_groups.values(): quant_targets: Set[str] = expand_target_names( model=model, targets=scheme.targets, ignore=self.quantization_config.ignore, ) unexpected_keys.update( merge_names(target, param) for target in quant_targets for param in self.quantization_compressor.compression_param_names if param != "weight" ) return list(unexpected_keys) def compress( self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None ) -> Dict[str, Tensor]: """ Compresses a dense state dict or model with sparsity and/or quantization :param model: uncompressed model to compress :param state_dict: optional uncompressed state_dict to insert into model :return: compressed state dict """ if state_dict is None: state_dict = model.state_dict() compressed_state_dict = state_dict quantized_modules_to_args: Dict[ str, QuantizationArgs ] = map_modules_to_quant_args(model) if self.quantization_compressor is not None: compressed_state_dict = self.quantization_compressor.compress( state_dict, names_to_scheme=quantized_modules_to_args ) if self.quantization_config.format != CompressionFormat.dense.value: self.quantization_config.quantization_status = ( QuantizationStatus.COMPRESSED ) if self.sparsity_compressor is not None: sparse_compression_targets: Set[str] = expand_target_names( model=model, targets=self.sparsity_config.targets, ignore=self.sparsity_config.ignore, ) compressed_state_dict = self.sparsity_compressor.compress( compressed_state_dict, compression_targets=sparse_compression_targets, ) # HACK: Override the dtype_byte_size function in transformers to # support float8 types. Fix is posted upstream # https://github.com/huggingface/transformers/pull/30488 transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size return compressed_state_dict def decompress(self, model_path: str, model: Module): """ Overwrites the weights in model with weights decompressed from model_path :param model_path: path to compressed weights :param model: pytorch model to load decompressed weights into Note: decompress makes use of both _replace_sparsity_weights and _replace_weights The variations in these methods are a result of the subtle variations between the sparsity and quantization compressors. Specifically, quantization compressors return not just the decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity compressors only return the decompressed weight. 
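Example (a sketch mirroring the Decompression LifeCycle described in the
class docstring; ``comp_model_path`` is assumed to point to a compressed
checkpoint on disk):

    compressor = ModelCompressor.from_pretrained(comp_model_path)
    model = AutoModel.from_pretrained(comp_model_path)
    compressor.decompress(comp_model_path, model)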
""" model_path = get_safetensors_folder(model_path) sparse_decompressed = False if ( self.sparsity_compressor is not None and self.sparsity_config.format != CompressionFormat.dense.value ): params_to_ignore = None if self.quantization_compressor is not None: params_to_ignore = self.quantization_compressor.compression_param_names # Sparse decompression is applied on the model_path # The compressor will try and load any quantization parameters as well # params_to_skip_load will skip over quantization params from being loaded dense_gen = self.sparsity_compressor.decompress( model_path, params_to_skip_load=params_to_ignore ) self._replace_sparsity_weights(dense_gen, model) setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config) sparse_decompressed = True if self.quantization_compressor is not None: # Temporarily set quantization status to FROZEN to prevent # quantization during apply_quantization_config. This ensures # that the dtypes of the weights are not unintentionally updated. # The status is restored after quantization params are loaded. with override_quantization_status( self.quantization_config, QuantizationStatus.FROZEN ): names_to_scheme = apply_quantization_config( model, self.quantization_config ) # Load activation scales/zp or any other quantization parameters # Conditionally load the weight quantization parameters if we have a dense compressor # Or if a sparsity compressor has already been applied load_pretrained_quantization_parameters( model, model_path, # TODO: all weight quantization params will be moved to the compressor in a follow-up # including initialization load_weight_quantization=( sparse_decompressed or isinstance(self.quantization_compressor, DenseCompressor) ), ) model_path_or_state_dict = ( model.state_dict() if sparse_decompressed else model_path ) dense_gen = self.quantization_compressor.decompress( model_path_or_state_dict, names_to_scheme=names_to_scheme ) # TODO: all weight quantization params will be moved to the compressor # to prevent duplicate parameter updates in update_parameter_data self._replace_weights(dense_gen, model) def freeze_quantization_status(module): module.quantization_status = QuantizationStatus.FROZEN model.apply(freeze_quantization_status) setattr(model, QUANTIZATION_CONFIG_NAME, self.quantization_config) def update_config(self, save_directory: str): """ Update the model config located at save_directory with compression configs for sparsity and/or quantization :param save_directory: path to a folder containing a HF model config """ if self.quantization_config is None and self.sparsity_config is None: return config_file_path = os.path.join(save_directory, CONFIG_NAME) if not os.path.exists(config_file_path): _LOGGER.warning( f"Could not find a valid model config file in " f"{save_directory}. Compression config will not be saved." 
) return with open(config_file_path, "r") as config_file: config_data = json.load(config_file) # required metadata whenever a quantization or sparsity config is present # overwrite previous config and version if already existing config_data[QUANTIZATION_CONFIG_NAME] = {} config_data[QUANTIZATION_CONFIG_NAME][ COMPRESSION_VERSION_NAME ] = compressed_tensors.__version__ if self.quantization_config is not None: self.quantization_config.quant_method = DEFAULT_QUANTIZATION_METHOD else: config_data[QUANTIZATION_CONFIG_NAME][ QUANTIZATION_METHOD_NAME ] = DEFAULT_QUANTIZATION_METHOD # quantization and sparsity configs if self.quantization_config is not None: quant_config_data = self.quantization_config.model_dump() config_data[QUANTIZATION_CONFIG_NAME] = quant_config_data if self.sparsity_config is not None: sparsity_config_data = self.sparsity_config.model_dump() config_data[QUANTIZATION_CONFIG_NAME][ SPARSITY_CONFIG_NAME ] = sparsity_config_data with open(config_file_path, "w") as config_file: json.dump(config_data, config_file, indent=2, sort_keys=True) def _replace_sparsity_weights(self, dense_weight_generator, model: Module): """ Replace the weights of the model with the provided dense weights. This method iterates over the dense_weight_generator and updates the corresponding weights in the model. If a parameter name does not exist in the model, it will be skipped. :param dense_weight_generator (generator): A generator that yields tuples of (name, data), where 'name' is the parameter name and 'data' is the updated param data :param model: The model whose weights are to be updated. """ for name, data in tqdm(dense_weight_generator, desc="Decompressing model"): split_name = name.split(".") prefix, param_name = ".".join(split_name[:-1]), split_name[-1] module = operator.attrgetter(prefix)(model) params_device = next(module.parameters()).device device = "cpu" if has_offloaded_params(module) else params_device delattr(module, param_name) requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16) param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad) register_offload_parameter(module, param_name, param) def _replace_weights(self, dense_weight_generator, model: Module): """ Replace the weights of the model with the provided dense weights. This method iterates over the dense_weight_generator and updates the corresponding weights in the model. If a parameter name does not exist in the model, it will be skipped. :param dense_weight_generator (generator): A generator that yields tuples of (name, data), where 'name' is the parameter name and 'data' is the updated param data :param model: The model whose weights are to be updated. 
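For this method specifically, each yielded ``data`` is a mapping of
parameter names to decompressed values for one module, e.g. roughly
{"weight": ..., "weight_scale": ...} (illustrative keys); see the loop over
``data.items()`` below.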
""" for name, data in tqdm(dense_weight_generator, desc="Decompressing model"): module = operator.attrgetter(name)(model) params_device = next(module.parameters()).device device = "cpu" if has_offloaded_params(module) else params_device for param_name, param_data in data.items(): if hasattr(module, param_name): # If compressed, will have an incorrect dtype for transformers >4.49 # TODO: we can also just skip initialization of scales/zp if in decompression in init # to be consistent with loading which happens later as well # however, update_data does a good shape check - should be moved to the compressor if param_name == "weight": delattr(module, param_name) requires_grad = param_data.dtype in ( torch.float16, torch.float32, torch.bfloat16, ) param = torch.nn.Parameter( param_data.to(device), requires_grad=requires_grad ) register_offload_parameter(module, param_name, param) else: # Should already be registered to the correct device for # for scales/zero-points update_parameter_data(module, param_data, param_name) def map_modules_to_quant_args( model: Module, ) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]: """ Given a pytorch model, map out the submodule name (usually linear layers) to the weight QuantizationArgs. If running input activation quantization, will also map to the input QuantizationArgs in a tuple. :param model: pytorch model """ quantized_modules_to_args = {} for name, submodule in iter_named_leaf_modules(model): if is_module_quantized(submodule): if submodule.quantization_scheme.weights is not None: name = fix_fsdp_module_name(name) quantized_modules_to_args[name] = submodule.quantization_scheme.weights if submodule.quantization_scheme.input_activations is not None: weight_args = quantized_modules_to_args.get(name) quantized_modules_to_args[name] = ( weight_args, submodule.quantization_scheme.input_activations, ) return quantized_modules_to_args # HACK: Override the dtype_byte_size function in transformers to support float8 types # Fix is posted upstream https://github.com/huggingface/transformers/pull/30488 def new_dtype_byte_size(dtype): if dtype == torch.bool: return 1 / 8 bit_search = re.search(r"[^\d](\d+)_?", str(dtype)) if bit_search is None: raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") bit_size = int(bit_search.groups()[0]) return bit_size // 8 @contextmanager def override_quantization_status( config: QuantizationConfig, status: QuantizationStatus ): """ Within this context, the quantization status will be set to the supplied status. After the context exits, the original status will be restored. :param config: the quantization config to override :param status: the status to temporarily set """ original_status = config.quantization_status config.quantization_status = status try: yield finally: config.quantization_status = original_status compressed-tensors-0.9.4/src/compressed_tensors/compressors/quantized_compressors/000077500000000000000000000000001500222531600311115ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/quantized_compressors/__init__.py000066400000000000000000000013121500222531600332170ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .base import * from .naive_quantized import * from .pack_quantized import * compressed-tensors-0.9.4/src/compressed_tensors/compressors/quantized_compressors/base.py000066400000000000000000000217451500222531600324060ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from pathlib import Path from typing import Any, Dict, Generator, Optional, Tuple, Union import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy from compressed_tensors.utils import ( get_nested_mappings_from_state_dict, get_nested_weight_mappings, merge_names, ) from safetensors import safe_open from torch import Tensor from tqdm import tqdm _LOGGER: logging.Logger = logging.getLogger(__name__) __all__ = ["BaseQuantizationCompressor"] class BaseQuantizationCompressor(BaseCompressor): """ Base class representing a quant compression algorithm. Each child class should implement compression_param_info, compress_weight and decompress_weight. Compressors support compressing/decompressing a full module state dict or a single quantized PyTorch leaf module. Model Load Lifecycle (run_compressed=False): - ModelCompressor.decompress() - apply_quantization_config() - BaseQuantizationCompressor.decompress() - BaseQuantizationCompressor.decompress_weight() Model Save Lifecycle: - ModelCompressor.compress() - BaseQuantizationCompressor.compress() - BaseQuantizationCompressor.compress_weight() Module Lifecycle (run_compressed=True): - apply_quantization_config() - compressed_module = CompressedLinear(module) - initialize_module_for_quantization() - BaseQuantizationCompressor.compression_param_info() - register_parameters() - compressed_module.forward() - compressed_module.decompress() :param config: config specifying compression parameters """ def compress( self, model_state: Dict[str, Tensor], names_to_scheme: Dict[str, QuantizationArgs], **kwargs, ) -> Dict[str, Tensor]: """ Compresses a dense state dict :param model_state: state dict of uncompressed model :param names_to_scheme: quantization args for each quantized weight, needed for quantize function to calculate bit depth :return: compressed state dict """ compressed_dict = {} weight_suffix = ".weight" input_zp_suffix = ".input_zero_point" weight_zp_suffix = ".weight_zero_point" _LOGGER.debug( f"Compressing model with {len(model_state)} parameterized layers..." 
) for name, value in tqdm(model_state.items(), desc="Quantized Compression"): # check if the parameter we're compressing is the weight zp # or the input zp is_weight_zp = name.endswith(weight_zp_suffix) is_input_zp = name.endswith(input_zp_suffix) # if we're saving the weight zp, fetch weight quant args if is_weight_zp: quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))]) if isinstance(quant_args_zp, tuple): # If tuple, first value is weight args, second is input args quant_args_zp = quant_args_zp[0] # if we're saving the input zp, fetch input quant args if is_input_zp: input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))]) if isinstance(input_args_zp, tuple): # If tuple, first value is weight args, second is input args input_args_zp = input_args_zp[-1] if name.endswith(weight_suffix): prefix = name[: -(len(weight_suffix))] scale = model_state.get(merge_names(prefix, "weight_scale"), None) zp = model_state.get(merge_names(prefix, "weight_zero_point"), None) g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None) if scale is not None: # weight is quantized, compress it if isinstance(names_to_scheme[prefix], tuple): quant_args = names_to_scheme[prefix][0] else: quant_args = names_to_scheme[prefix] compressed_data = self.compress_weight( weight=value, scale=scale, zero_point=zp, g_idx=g_idx, quantization_args=quant_args, device="cpu", ) for key, value in compressed_data.items(): compressed_dict[merge_names(prefix, key)] = value else: compressed_dict[name] = value.to("cpu") # only save zp if asym and not packed zp elif is_weight_zp and ( quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args) ): continue # only save if asym elif is_input_zp and input_args_zp.symmetric: continue elif name.endswith("g_idx") and torch.any(value <= -1): continue else: compressed_dict[name] = value.to("cpu") return compressed_dict def _check_if_zp_pack_quantized(self, quant_args): from compressed_tensors.compressors import PackedQuantizationCompressor if isinstance(self, PackedQuantizationCompressor): if not quant_args.symmetric and quant_args.strategy in [ QuantizationStrategy.GROUP.value, QuantizationStrategy.CHANNEL.value, ]: return True return False def decompress( self, path_to_model_or_tensors: Union[str, Path, Dict[str, Any]], names_to_scheme: Dict[str, QuantizationArgs], device: str = "cpu", ) -> Generator[Tuple[str, Tensor], None, None]: """ Reads a compressed state dict located at path_to_model_or_tensors and returns a generator for sequentially decompressing back to a dense state dict :param path_to_model_or_tensors: path to compressed safetensors model (directory with one or more safetensors files) or compressed tensors file :param names_to_scheme: quantization args for each quantized weight :param device: optional device to load intermediate weights into :return: compressed state dict """ if isinstance(path_to_model_or_tensors, (str, Path)): yield from self._decompress_from_path( path_to_model_or_tensors, names_to_scheme, device ) else: yield from self._decompress_from_state_dict( path_to_model_or_tensors, names_to_scheme ) def _decompress_from_path(self, path_to_model, names_to_scheme, device): weight_mappings = get_nested_weight_mappings( path_to_model, self.compression_param_names ) for weight_name in weight_mappings.keys(): weight_data = {} for param_name, safe_path in weight_mappings[weight_name].items(): full_name = merge_names(weight_name, param_name) with safe_open(safe_path, framework="pt", device=device) as f: weight_data[param_name] = 
f.get_tensor(full_name) if "weight_scale" in weight_data: quant_args = names_to_scheme[weight_name] decompressed = self.decompress_weight( compressed_data=weight_data, quantization_args=quant_args ) weight_data["weight"] = decompressed yield weight_name, weight_data def _decompress_from_state_dict(self, state_dict, names_to_scheme): weight_mappings = get_nested_mappings_from_state_dict( state_dict, self.compression_param_names ) for weight_name in weight_mappings.keys(): weight_data = {} for param_name, param_value in weight_mappings[weight_name].items(): weight_data[param_name] = param_value if "weight_scale" in weight_data: quant_args = names_to_scheme[weight_name] decompressed = self.decompress_weight( compressed_data=weight_data, quantization_args=quant_args ) weight_data["weight"] = decompressed yield weight_name, weight_data compressed-tensors-0.9.4/src/compressed_tensors/compressors/quantized_compressors/naive_quantized.py000066400000000000000000000120361500222531600346530ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict, Optional, Tuple import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.compressors.quantized_compressors.base import ( BaseQuantizationCompressor, ) from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import QuantizationArgs from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize from compressed_tensors.quantization.utils import can_quantize from torch import Tensor __all__ = [ "NaiveQuantizationCompressor", "IntQuantizationCompressor", "FloatQuantizationCompressor", ] @BaseCompressor.register(name=CompressionFormat.naive_quantized.value) class NaiveQuantizationCompressor(BaseQuantizationCompressor): """ Implements naive compression for quantized models. Weight of each quantized layer is converted from its original float type to the closest Pytorch type to the type specified by the layer's QuantizationArgs. 
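    Example (illustrative sketch only, not an API guarantee; assumes the caller
    has already created the compressor, an uncompressed weight tensor, its
    scale, zero-point and int8 QuantizationArgs):

        compressed = compressor.compress_weight(
            weight=weight,
            scale=scale,
            zero_point=zero_point,
            quantization_args=quant_args,
        )
        restored = compressor.decompress_weight(
            compressed_data={**compressed, "weight_scale": scale},
            quantization_args=quant_args,
        )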
""" @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return ( "weight", "weight_scale", "weight_zero_point", "weight_g_idx", ) def compression_param_info( self, weight_shape: torch.Size, quantization_args: Optional[QuantizationArgs] = None, ) -> Dict[str, Tuple[torch.Size, torch.dtype]]: """ Creates a dictionary of expected shapes and dtypes for each compression parameter used by the compressor :param weight_shape: uncompressed weight shape :param quantization_args: quantization parameters for the weight :return: dictionary mapping compressed parameter names to shape and dtype """ dtype = quantization_args.pytorch_dtype() return {"weight": (weight_shape, dtype)} def compress_weight( self, weight: Tensor, scale: Tensor, quantization_args: QuantizationArgs, zero_point: Optional[Tensor] = None, g_idx: Optional[torch.Tensor] = None, device: Optional[torch.device] = None, ) -> Dict[str, torch.Tensor]: """ Compresses a single uncompressed weight :param weight: uncompressed weight tensor :param scale: quantization scale for weight :param quantization_args: quantization parameters for weight :param zero_point: quantization zero point for weight :param g_idx: optional mapping from column index to group index :param device: optional device to move compressed output to :return: dictionary of compressed weight data """ if can_quantize(weight, quantization_args): quantized_weight = quantize( x=weight, scale=scale, zero_point=zero_point, g_idx=g_idx, args=quantization_args, dtype=quantization_args.pytorch_dtype(), ) else: quantized_weight = weight if device is not None: quantized_weight = quantized_weight.to(device) return {"weight": quantized_weight} def decompress_weight( self, compressed_data: Dict[str, Tensor], quantization_args: Optional[QuantizationArgs] = None, ) -> torch.Tensor: """ Decompresses a single compressed weight :param compressed_data: dictionary of data needed for decompression :param quantization_args: quantization parameters for the weight :return: tensor of the decompressed weight """ weight = compressed_data["weight"] scale = compressed_data["weight_scale"] zero_point = compressed_data.get("weight_zero_point", None) g_idx = compressed_data.get("weight_g_idx", None) decompressed_weight = dequantize( x_q=weight, scale=scale, zero_point=zero_point, g_idx=g_idx ) return decompressed_weight @BaseCompressor.register(name=CompressionFormat.int_quantized.value) class IntQuantizationCompressor(NaiveQuantizationCompressor): """ Alias for integer quantized models """ pass @BaseCompressor.register(name=CompressionFormat.float_quantized.value) class FloatQuantizationCompressor(NaiveQuantizationCompressor): """ Alias for fp quantized models """ pass compressed-tensors-0.9.4/src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py000066400000000000000000000256321500222531600344750ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. import math from typing import Dict, Literal, Optional, Tuple, Union import numpy as np import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.compressors.quantized_compressors.base import ( BaseQuantizationCompressor, ) from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize from compressed_tensors.quantization.utils import can_quantize from torch import Tensor __all__ = ["PackedQuantizationCompressor", "pack_to_int32", "unpack_from_int32"] @BaseCompressor.register(name=CompressionFormat.pack_quantized.value) class PackedQuantizationCompressor(BaseQuantizationCompressor): """ Compresses a quantized model by packing every eight 4-bit weights into an int32 """ @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return ( "weight_packed", "weight_scale", "weight_zero_point", "weight_g_idx", "weight_shape", ) def compression_param_info( self, weight_shape: torch.Size, quantization_args: Optional[QuantizationArgs] = None, ) -> Dict[str, Tuple[torch.Size, torch.dtype]]: """ Creates a dictionary of expected shapes and dtypes for each compression parameter used by the compressor :param weight_shape: uncompressed weight shape :param quantization_args: quantization parameters for the weight :return: dictionary mapping compressed parameter names to shape and dtype """ pack_factor = 32 // quantization_args.num_bits packed_size = math.ceil(weight_shape[1] / pack_factor) packed_size_zp = math.ceil(weight_shape[0] / pack_factor) output = { "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32), "weight_shape": (torch.Size((2,)), torch.int32), } if not quantization_args.symmetric and quantization_args.strategy in [ QuantizationStrategy.GROUP.value, QuantizationStrategy.CHANNEL.value, ]: zp_factor = ( quantization_args.group_size if quantization_args.strategy == QuantizationStrategy.GROUP.value else weight_shape[-1] ) output["weight_zero_point"] = ( torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)), torch.int32, ) return output def compress_weight( self, weight: Tensor, scale: Tensor, quantization_args: QuantizationArgs, zero_point: Optional[Tensor] = None, g_idx: Optional[torch.Tensor] = None, device: Optional[torch.device] = None, ) -> Dict[str, torch.Tensor]: """ Compresses a single uncompressed weight :param weight: uncompressed weight tensor :param scale: quantization scale for weight :param quantization_args: quantization parameters for weight :param zero_point: quantization zero point for weight :param g_idx: optional mapping from column index to group index :param device: optional device to move compressed output to :return: dictionary of compressed weight data """ compressed_dict = {} if can_quantize(weight, quantization_args): quantized_weight = quantize( x=weight, scale=scale, zero_point=zero_point, g_idx=g_idx, args=quantization_args, dtype=torch.int8, ) else: quantized_weight = weight packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits) weight_shape = torch.tensor(weight.shape) if device is not None: packed_weight = packed_weight.to(device) weight_shape = weight_shape.to(device) compressed_dict["weight_shape"] = weight_shape 
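        # "weight_shape" preserves the original (pre-padding) dimensions so that
        # decompression can trim the zero padding introduced by pack_to_int32;
        # "weight_packed" holds 32 // num_bits quantized values per int32
        # element (8 values per int32 for 4-bit weights).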
compressed_dict["weight_packed"] = packed_weight # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp if not quantization_args.symmetric and quantization_args.strategy in [ QuantizationStrategy.GROUP.value, QuantizationStrategy.CHANNEL.value, ]: packed_zp = pack_to_int32( zero_point, quantization_args.num_bits, packed_dim=0 ) compressed_dict["weight_zero_point"] = packed_zp return compressed_dict def decompress_weight( self, compressed_data: Dict[str, Tensor], quantization_args: Optional[QuantizationArgs] = None, ) -> torch.Tensor: """ Decompresses a single compressed weight :param compressed_data: dictionary of data needed for decompression :param quantization_args: quantization parameters for the weight :return: tensor of the decompressed weight """ weight = compressed_data["weight_packed"] scale = compressed_data["weight_scale"] zero_point = compressed_data.get("weight_zero_point", None) g_idx = compressed_data.get("weight_g_idx", None) original_shape = torch.Size(compressed_data["weight_shape"]) num_bits = quantization_args.num_bits unpacked = unpack_from_int32(weight, num_bits, original_shape) # NOTE: this will fail decompression as we don't currently handle packed zp on decompression if not quantization_args.symmetric and quantization_args.strategy in [ QuantizationStrategy.GROUP.value, QuantizationStrategy.CHANNEL.value, ]: raise ValueError( "Decompression of packed zero points is currently not supported" ) assert zero_point is not None original_zp_shape = (original_shape[0], scale.shape[-1]) zero_point = unpack_from_int32( zero_point, num_bits, original_zp_shape, packed_dim=0 ) decompressed_weight = dequantize( x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx ) return decompressed_weight def pack_to_int32( value: torch.Tensor, num_bits: int, packed_dim: Union[Literal[0], Literal[1]] = 1, ) -> torch.Tensor: """ Packs a tensor of quantized weights stored in int8 into int32s with padding Pseudocode: 1. Shift wrt num_bits to convert to unsigned. num_bits=8 [1,2] -> [129, 130] 2. Pad to fill in 32 bits [129, 130] -> [129, 130, 0, 0] 3. convert to binary align in order [129, 130, 0, 0] -> 00000000 00000000 10000010 10000001 4. convert aligned binary to number 00000000000000001000001010000001 -> 33409 5. 
covert back to uint32 33409 -> 33409 :param value: tensor to pack :param num_bits: number of bits used to store underlying data, must be at least 1 :returns: packed int32 tensor """ if value.dtype is not torch.int8: raise ValueError("Tensor must be quantized to torch.int8 before packing") if num_bits > 8: raise ValueError("Packing is only supported for less than 8 bits") if num_bits < 1: raise ValueError(f"num_bits must be at least 1, got {num_bits}") # convert to unsigned for packing offset = 1 << (num_bits - 1) value = (value + offset).to(torch.uint8) value = value.cpu().numpy().astype(np.uint32) pack_factor = 32 // num_bits # pad input tensor and initialize packed output packed_size = math.ceil(value.shape[packed_dim] / pack_factor) padding = packed_size * pack_factor - value.shape[packed_dim] value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0) # pack values if packed_dim == 1: packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32) for i in range(pack_factor): packed |= value[:, i::pack_factor] << num_bits * i else: packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32) for i in range(pack_factor): packed |= value[i::pack_factor, :] << num_bits * i # convert back to signed and torch packed = np.ascontiguousarray(packed).view(np.int32) return torch.from_numpy(packed) def unpack_from_int32( value: torch.Tensor, num_bits: int, shape: torch.Size, packed_dim: Union[Literal[0], Literal[1]] = 1, ) -> torch.Tensor: """ Unpacks a tensor of packed int32 weights into individual int8s, maintaining the original bit range. Return tensors in int8 :param value: tensor to upack :param num_bits: number of bits to unpack each data point into :param shape: shape to unpack into, used to remove padding :returns: unpacked int8 tensor """ if value.dtype is not torch.int32: raise ValueError( f"Expected {torch.int32} but got {value.dtype}, Aborting unpack." ) if num_bits > 8: raise ValueError("Unpacking is only supported for less than 8 bits") pack_factor = 32 // num_bits # unpack mask = (1 << num_bits) - 1 if packed_dim == 1: unpacked = torch.zeros( (value.shape[0], value.shape[1] * pack_factor), device=value.device, dtype=torch.int32, ) for i in range(pack_factor): unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask # remove padding original_row_size = int(shape[1]) unpacked = unpacked[:, :original_row_size] else: unpacked = torch.zeros( (value.shape[0] * pack_factor, value.shape[1]), device=value.device, dtype=torch.int32, ) for i in range(pack_factor): unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask # remove padding original_row_size = int(shape[0]) unpacked = unpacked[:original_row_size, :] # bits are packed in unsigned format, reformat to signed # update the value range from unsigned to signed offset = pow(2, num_bits) // 2 unpacked = (unpacked - offset).to(torch.int8) return unpacked compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/000077500000000000000000000000001500222531600304025ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/__init__.py000066400000000000000000000013411500222531600325120ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .base import * from .dense import * from .sparse_24_bitmask import * from .sparse_bitmask import * compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/base.py000066400000000000000000000147771500222531600317060ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from typing import Dict, Generator, Optional, Set, Tuple from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.utils import get_nested_weight_mappings, merge_names from safetensors import safe_open from torch import Tensor from tqdm import tqdm __all__ = ["BaseSparseCompressor"] _LOGGER: logging.Logger = logging.getLogger(__name__) class BaseSparseCompressor(BaseCompressor): """ Base class representing a sparse compression algorithm. Each child class should implement compression_param_names, compress_weight and decompress_weight; Compressors support compressing/decompressing a full module state dict or a single quantized PyTorch leaf module. Model Load Lifecycle (run_compressed=False): - ModelCompressor.decompress() - apply_quantization_config() - BaseSparseCompressor.decompress() - BaseSparseCompressor.decompress_weight() Model Save Lifecycle: - ModelCompressor.compress() - BaseSparseCompressor.compress() - BaseSparseCompressor.compress_weight() Module Lifecycle (run_compressed=True): - apply_quantization_config() - compressed_module = CompressedLinear(module) - initialize_module_for_quantization() - BaseSparseCompressor.compression_param_info() - register_parameters() - compressed_module.forward() - compressed_module.decompress() :param config: config specifying compression parameters """ def compress( self, model_state: Dict[str, Tensor], compression_targets: Optional[Set[str]] = None, ) -> Dict[str, Tensor]: """ Compresses a dense state dict using bitmask compression :param model_state: state dict of uncompressed model :param compression_targets: optional set of layer prefixes to compress, otherwise compress all layers (for backwards compatibility) :return: compressed state dict """ compressed_dict = {} _LOGGER.debug( f"Compressing model with {len(model_state)} parameterized layers..." 
) for name, value in tqdm(model_state.items(), desc="Compressing model"): if not self.should_compress(name, compression_targets): compressed_dict[name] = value continue prefix = name if prefix.endswith(".weight"): prefix = prefix[: -(len(".weight"))] compression_data = self.compress_weight(prefix, value) for key in compression_data.keys(): if key in compressed_dict: _LOGGER.warn( f"Expected all compressed state_dict keys to be unique, but " f"found an existing entry for {key}. The existing entry will " "be replaced." ) compressed_dict.update(compression_data) return compressed_dict def decompress( self, path_to_model_or_tensors: str, device: str = "cpu", params_to_skip_load: Optional[Tuple] = None, **kwargs, ) -> Generator[Tuple[str, Tensor], None, None]: """ Reads a bitmask compressed state dict located at path_to_model_or_tensors and returns a generator for sequentially decompressing back to a dense state dict :param model_path: path to compressed safetensors model (directory with one or more safetensors files) or compressed tensors file :param device: device to load decompressed weights onto :param params_to_skip_load: a list of non-sparsity parameters (e.g quantization parameters) that we want to skip loading. As the sparsity compresssor does not handle quantized decompression, this should contain any quantization parameters when decompressing stacked compressors. We want these parameters to be handled by the quantization decompressor :return: iterator for generating decompressed weights """ weight_mappings, ignored_params = get_nested_weight_mappings( path_to_model_or_tensors, self.compression_param_names, return_unmatched_params=True, ) for weight_name in weight_mappings.keys(): weight_data = {} for param_name, safe_path in weight_mappings[weight_name].items(): full_name = merge_names(weight_name, param_name) with safe_open(safe_path, framework="pt", device=device) as f: weight_data[param_name] = f.get_tensor(full_name) decompressed = self.decompress_weight(weight_data) yield merge_names(weight_name, "weight"), decompressed for ignored_param_name, safe_path in ignored_params.items(): should_skip = False if params_to_skip_load is not None: for param_to_skip in params_to_skip_load: if param_to_skip in ignored_param_name: should_skip = True if not should_skip: with safe_open(safe_path, framework="pt", device=device) as f: value = f.get_tensor(ignored_param_name) yield ignored_param_name, value @staticmethod def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool: """ Check if a parameter should be compressed. Currently, this only returns True for weight parameters. :param name: name of the parameter :param expanded_targets: set of layer prefixes to compress :return: whether or not the parameter should be compressed """ if expanded_targets is None: return name.endswith(".weight") return ( name.endswith(".weight") and name[: -(len(".weight"))] in expanded_targets ) compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/dense.py000066400000000000000000000027411500222531600320560ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Dict, Generator, Tuple from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.config import CompressionFormat from torch import Tensor @BaseCompressor.register(name=CompressionFormat.dense.value) class DenseCompressor(BaseCompressor): """ Identity compressor for dense models, returns the original state_dict """ @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return () def compress(self, model_state: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: return model_state def decompress( self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs ) -> Generator[Tuple[str, Tensor], None, None]: return iter([]) compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/sparse_24_bitmask.py000066400000000000000000000211471500222531600342750ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass from typing import Dict, List, Tuple, Union import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor from compressed_tensors.config import CompressionFormat, SparsityStructure from compressed_tensors.quantization import FP8_DTYPE from compressed_tensors.utils import merge_names, pack_bitmasks, unpack_bitmasks from torch import Tensor __all__ = [ "Sparse24BitMaskCompressor", "Sparse24BitMaskTensor", "sparse24_bitmask_compress", "sparse24_bitmask_decompress", "get_24_bytemasks", ] @BaseCompressor.register(name=CompressionFormat.sparse_24_bitmask.value) class Sparse24BitMaskCompressor(BaseSparseCompressor): """ Compression for sparse models using bitmasks. 
Non-zero weights are stored in a 2d values tensor, with their locations stored in a 2d bitmask """ @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return ( "shape", "compressed", "bitmask", ) def compress_weight(self, name, value): bitmask_tensor = Sparse24BitMaskTensor.from_dense( value, self.config.sparsity_structure ) bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu") return bitmask_dict def decompress_weight(self, weight_data): data = Sparse24BitMaskTensor.from_compressed_data(**weight_data) decompressed = data.decompress() return decompressed @dataclass class Sparse24BitMaskTensor: """ Owns compressions and decompression for a single 2:4 sparse bitmask compressed tensor. :param shape: shape of dense tensor :param compressed: 2d tensor of non-zero values :param bitmask: 2d bitmask of non-zero values """ shape: List[int] compressed: Tensor bitmask: Tensor @staticmethod def from_dense( tensor: Tensor, sparsity_structure: Union[SparsityStructure, str] = SparsityStructure.TWO_FOUR, ) -> "Sparse24BitMaskTensor": """ :param tensor: dense tensor to compress :return: instantiated compressed tensor """ shape = list(tensor.shape) compressed, bitmask = sparse24_bitmask_compress( tensor.cpu(), sparsity_structure=sparsity_structure ) return Sparse24BitMaskTensor( shape=shape, compressed=compressed, bitmask=bitmask, ) @staticmethod def from_compressed_data( shape: Union[List[int], Tensor], compressed: Tensor, bitmask: Tensor ) -> "Sparse24BitMaskTensor": """ :param shape: shape of the dense tensor (can be a list or a tensor) :param compressed: 2d tensor of non-zero values :param bitmask: 2d bitmask of non-zero values :return: instantiated Sparse24BitMaskTensor """ if isinstance(shape, list): shape = torch.tensor(shape) if isinstance(shape, torch.Tensor): shape = shape.flatten().tolist() return Sparse24BitMaskTensor( shape=shape, compressed=compressed, bitmask=bitmask ) def decompress(self) -> Tensor: """ :return: reconstructed dense tensor """ return sparse24_bitmask_decompress(self.compressed, self.bitmask, self.shape) def curr_memory_size_bytes(self) -> int: """ :return: size in bytes required to store compressed tensor on disk """ def sizeof_tensor(a: Tensor) -> int: return a.element_size() * a.nelement() return sizeof_tensor(self.compressed) + sizeof_tensor(self.bitmask) def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: """ :param name_prefix: name of original tensor to store compressed weight as :return: dict of compressed data for the stored weight """ if name_prefix.endswith(".weight"): name_prefix = name_prefix[: -len(".weight")] return { merge_names(name_prefix, "shape"): torch.tensor( self.shape, device=device ).reshape(-1, 1), merge_names(name_prefix, "compressed"): self.compressed.to(device), merge_names(name_prefix, "bitmask"): self.bitmask.to(device), } def __repr__(self) -> str: return f"BitMaskTensor(shape={self.shape}, compressed=True)" def sparse24_bitmask_compress( tensor: Tensor, sparsity_structure: Union[SparsityStructure, str] = SparsityStructure.TWO_FOUR, ) -> Tuple[Tensor, Tensor, Tensor]: """ Compresses a dense tensor using bitmask compression :param tensor: dense 2D tensor to compress :param sparsity_structure: structure of sparsity in the tensor, defaults to unstructured, can also be set to `2:4` :return: tuple of compressed data representing tensor """ assert len(tensor.shape) == 2, "Only 2D tensors are supported" 
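    # With 2:4 sparsity exactly half of each row survives, so the kept values
    # are stored densely as (num_rows, num_cols // 2) while their positions are
    # recorded in a packed bitmask (one bit per original element) built by
    # pack_bitmasks further below.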
assert ( SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR ), "Only 2:4 sparsity is supported" bytemasks = get_24_bytemasks(tensor=tensor) if tensor.dtype == FP8_DTYPE: # acces raw bytes of the tensor tensor_view = tensor.view(torch.int8) values = tensor_view[bytemasks] values = values.view(FP8_DTYPE) else: values = tensor[bytemasks] num_rows, num_cols = tensor.shape compressed_values = values.reshape(num_rows, num_cols // 2) bitmasks_packed = pack_bitmasks(bytemasks) return compressed_values, bitmasks_packed def sparse24_bitmask_decompress( values: Tensor, bitmasks: Tensor, original_shape: torch.Size ) -> Tensor: """ Reconstructs a dense tensor from a compressed one :param values: 1d tensor of non-zero values :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the tensors original shape :param original_shape: shape of the dense tensor :return: decompressed dense tensor """ bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape) decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype) decompressed_tensor = decompressed_tensor.to(values.device) values = values.flatten() if decompressed_tensor.dtype == FP8_DTYPE: decompressed_tensor[bytemasks_unpacked] = values decompressed_tensor = decompressed_tensor.cuda() else: decompressed_tensor[bytemasks_unpacked] = values return decompressed_tensor def get_24_bytemasks(tensor): """ Generate a 2:4 sparsity mask for the given tensor. This function creates a mask where exactly 2 out of every 4 elements are preserved based on their magnitudes. The preserved elements are the ones with the highest absolute values in each group of 4 elements. :param tensor: The input tensor for which the 2:4 sparsity mask is to be created. The tensor can be of any shape but its total number of elements must be a multiple of 4. :return: A boolean tensor of the same shape as the input tensor, where `True` indicates the preserved elements and `False` indicates the pruned elements. :raises ValueError: If the total number of elements in the tensor is not a multiple of 4. """ original_dtype = tensor.dtype if tensor.dtype == FP8_DTYPE: tensor = tensor.view(torch.int8) original_shape = tensor.shape num_elements = tensor.numel() if num_elements % 4 != 0: raise ValueError("Tensor size must be a multiple of 4 for TWO_FOUR sparsity") reshaped_tensor = tensor.view(-1, 4) abs_tensor = reshaped_tensor.abs() topk_indices = abs_tensor.topk(2, dim=1).indices mask = torch.zeros_like(reshaped_tensor, dtype=torch.bool) mask.scatter_(1, topk_indices, True) mask = mask.view(original_shape) tensor = tensor.view(original_dtype) return mask compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_compressors/sparse_bitmask.py000066400000000000000000000132741500222531600337720ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
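# Minimal usage sketch for the unstructured bitmask path in this module
# (assumes the caller supplies a 2D torch tensor named `dense`):
#
#     compressed = BitmaskTensor.from_dense(dense)
#     state = compressed.dict(name_prefix="decoder.fc1", device="cpu")  # hypothetical layer name
#     restored = compressed.decompress()  # reconstructs `dense` exactly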
from typing import Dict, List, Tuple, Union import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.compressors.sparse_compressors.base import BaseSparseCompressor from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import FP8_DTYPE from compressed_tensors.utils import merge_names, pack_bitmasks, unpack_bitmasks from torch import Tensor __all__ = [ "BitmaskCompressor", "BitmaskTensor", "bitmask_compress", "bitmask_decompress", ] @BaseCompressor.register(name=CompressionFormat.sparse_bitmask.value) class BitmaskCompressor(BaseSparseCompressor): """ Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d values tensor, with their locations stored in a 2d bitmask """ @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return ("shape", "compressed", "bitmask", "row_offsets") def compress_weight(self, name, value): bitmask_tensor = BitmaskTensor.from_dense(value) bitmask_dict = bitmask_tensor.dict(name_prefix=name, device="cpu") return bitmask_dict def decompress_weight(self, weight_data): data = BitmaskTensor(**weight_data) decompressed = data.decompress() return decompressed class BitmaskTensor: """ Owns compressions and decompression for a single bitmask compressed tensor. Adapted from: https://github.com/mgoin/torch_bitmask/tree/main :param shape: shape of dense tensor :compressed: flat tensor of non-zero values :bitmask: 2d bitmask of non-zero values :row_offsets: flat tensor indicating what index in values each dense row starts at """ def __init__( self, shape: Union[torch.Size, List], compressed: Tensor, bitmask: Tensor, row_offsets: Tensor, ): self.shape = list(shape) self.compressed = compressed self.bitmask = bitmask self.row_offsets = row_offsets @staticmethod def from_dense(tensor: Tensor) -> "BitmaskTensor": """ :param tensor: dense tensor to compress :return: instantiated compressed tensor """ shape = tensor.shape compressed, bitmask, row_offsets = bitmask_compress(tensor.cpu()) return BitmaskTensor( shape=shape, compressed=compressed, bitmask=bitmask, row_offsets=row_offsets ) def decompress(self) -> Tensor: """ :return: reconstructed dense tensor """ return bitmask_decompress(self.compressed, self.bitmask, self.shape) def curr_memory_size_bytes(self): """ :return: size in bytes required to store compressed tensor on disk """ def sizeof_tensor(a): return a.element_size() * a.nelement() return ( sizeof_tensor(self.compressed) + sizeof_tensor(self.bitmask) + sizeof_tensor(self.row_offsets) ) def dict(self, name_prefix: str, device: str = "cpu") -> Dict[str, Tensor]: """ :name_prefix: name of original tensor to store compressed weight as :return: dict of compressed data for the stored weight """ return { merge_names(name_prefix, "shape"): torch.tensor(self.shape, device=device), merge_names(name_prefix, "compressed"): self.compressed.to(device), merge_names(name_prefix, "bitmask"): self.bitmask.to(device), merge_names(name_prefix, "row_offsets"): self.row_offsets.to(device), } def __repr__(self): return f"BitmaskTensor(shape={self.shape}, compressed=True)" def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]: """ Compresses a dense tensor using bitmask compression :param tensor: dense tensor to compress :return: tuple of compressed data representing tensor """ bytemasks = tensor != 0 row_counts = bytemasks.sum(dim=-1) row_offsets = 
torch.cumsum(row_counts, 0) - row_counts if tensor.dtype == FP8_DTYPE: # acces raw bytes of the tensor tensor_view = tensor.view(torch.int8) values = tensor_view[bytemasks] values = values.view(FP8_DTYPE) else: values = tensor[bytemasks] bitmasks_packed = pack_bitmasks(bytemasks) return values, bitmasks_packed, row_offsets def bitmask_decompress( values: Tensor, bitmasks: Tensor, original_shape: torch.Size ) -> Tensor: """ Reconstructs a dense tensor from a compressed one :param values: 1d tensor of non-zero values :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the tensors original shape :param original_shape: shape of the dense tensor :return: decompressed dense tensor """ bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape) decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype) decompressed_tensor[bytemasks_unpacked] = values return decompressed_tensor compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_quantized_compressors/000077500000000000000000000000001500222531600324665ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_quantized_compressors/__init__.py000066400000000000000000000012431500222531600345770ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .marlin_24 import Marlin24Compressor marlin_24.py000066400000000000000000000226161500222531600345570ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/compressors/sparse_quantized_compressors# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
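# Compatibility sketch (illustrative only; the module name below is
# hypothetical): the Marlin24 path accepts symmetric group- or channel-wise
# quantization with group_size == 128, e.g.
#
#     Marlin24Compressor.validate_quant_compatability(
#         {"model.layers.0.self_attn.q_proj": quant_args}
#     )
#
# which raises a ValueError for any module whose args fall outside that set.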
import logging from typing import Dict, Generator, Tuple import numpy as np import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy from compressed_tensors.quantization.lifecycle.forward import quantize from compressed_tensors.utils import ( get_permutations_24, is_quantization_param, merge_names, sparse_semi_structured_from_dense_cutlass, tensor_follows_mask_structure, ) from torch import Tensor from tqdm import tqdm _LOGGER: logging.Logger = logging.getLogger(__name__) @BaseCompressor.register(name=CompressionFormat.marlin_24.value) class Marlin24Compressor(BaseCompressor): """ Compresses a quantized model with 2:4 sparsity structure for inference with the Marlin24 kernel. Decompression is not implemented for this compressor. """ @staticmethod def validate_quant_compatability( model_quant_args: Dict[str, QuantizationArgs] ) -> bool: """ Checks if every quantized module in the model is compatible with Marlin24 compression. Quantization must be channel or group strategy with group_size of 128. Only symmetric quantization is supported :param model_quant_args: dictionary of mapping module names to their quantization configuration :return: True if all modules are compatible with Marlin24 compression, raises a ValueError otherwise """ for name, quant_args in model_quant_args.items(): strategy = quant_args.strategy group_size = quant_args.group_size symmetric = quant_args.symmetric if ( strategy is not QuantizationStrategy.GROUP.value and strategy is not QuantizationStrategy.CHANNEL.value ): raise ValueError( f"Marlin24 Compressor is only valid for group and channel " f"quantization strategies, got {strategy} in {name}" ) if group_size is not None and group_size != 128: raise ValueError( f"Marlin24 Compressor is only valid for group size 128, " f"got {group_size} in {name}" ) if not symmetric: raise ValueError( f"Marlin24 Compressor is only valid for symmetric quantzation, " f"got symmetric={symmetric} in {name}" ) return True @staticmethod def validate_sparsity_structure(name: str, weight: Tensor) -> bool: """ Checks if a tensor fits the required 2:4 sparsity structure :param name: name of the tensor to check :param weight: tensor to check for sparsity structure :return: True if all rows match the 2:4 sparsity structure, raises ValueError otherwise """ if not tensor_follows_mask_structure(weight): raise ValueError( "Marlin24 Compressor is only compatible with weights that have " f"a 2:4 sparsity structure. Found segments in {name} " "that do not match the expected structure." ) return True @property def compression_param_names(self) -> Tuple[str]: """ Returns a tuple of compression parameter names introduced by the compressor during compression """ return ("weight_packed", "scale_packed", "meta") def compress( self, model_state: Dict[str, Tensor], names_to_scheme: Dict[str, QuantizationArgs], **kwargs, ) -> Dict[str, Tensor]: """ Compresses a quantized state_dict with 2:4 sparsity structure for inference with the Marlin24 kernel :param model_state: state dict of uncompressed model :param names_to_scheme: quantization args for each quantized weight, needed for quantize function to calculate bit depth :return: compressed state dict """ self.validate_quant_compatability(names_to_scheme) compressed_dict = {} weight_suffix = ".weight" _LOGGER.debug( f"Compressing model with {len(model_state)} parameterized layers..." 
) for name, value in tqdm(model_state.items(), desc="Compressing model"): if name.endswith(weight_suffix): prefix = name[: -(len(weight_suffix))] scale = model_state.get(merge_names(prefix, "weight_scale"), None) zp = model_state.get(merge_names(prefix, "weight_zero_point"), None) if scale is not None: # weight is quantized, compress it # Marlin24 kernel requires float16 inputs scale = scale.to(torch.float16) value = value.to(torch.float16) # quantize weight, keeping it as a float16 for now quant_args = names_to_scheme[prefix] value = quantize( x=value, scale=scale, zero_point=zp, args=quant_args ) # compress based on sparsity structure self.validate_sparsity_structure(prefix, value) value, meta = compress_weight_24(value) meta = meta.cpu() # Marlin24 kernel expects input dim first value = value.t().contiguous().cpu() scale = scale.t().contiguous().cpu() og_weight_shape = value.shape # Marlin24 kernel expects unsigned values, shift zero-point value += (1 << quant_args.num_bits) // 2 # pack quantized weight and scale value = pack_weight_24(value, quant_args) packed_scale = pack_scales_24(scale, quant_args, og_weight_shape) meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2) # save compressed values compressed_dict[merge_names(prefix, "scale_packed")] = packed_scale compressed_dict[merge_names(prefix, "weight_packed")] = value compressed_dict[merge_names(prefix, "meta")] = meta continue if not is_quantization_param(name): # export unquantized parameters without modifying compressed_dict[name] = value.to("cpu") return compressed_dict def decompress( self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs ) -> Generator[Tuple[str, Tensor], None, None]: raise NotImplementedError( "Decompression is not implemented for the Marlin24 Compressor." 
) def compress_weight_24(weight: Tensor): weight = weight.contiguous() w_comp, meta = sparse_semi_structured_from_dense_cutlass(weight) w_comp = w_comp.contiguous() return w_comp, meta def marlin_permute_weights(q_w, size_k, size_n, perm, tile): assert q_w.shape == (size_k, size_n) assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}" assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}" # Permute weights to 16x64 marlin tiles q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile)) q_w = q_w.permute((0, 2, 1, 3)) q_w = q_w.reshape((size_k // tile, size_n * tile)) q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape) return q_w def pack_weight_24( weight: Tensor, quantization_args: QuantizationArgs, tile: int = 16, ): size_k = weight.shape[0] size_n = weight.shape[1] num_bits = quantization_args.num_bits pack_factor = 32 // num_bits # Reshuffle to marlin_24 format perm, _, _ = get_permutations_24(num_bits) q_w = marlin_permute_weights(weight, size_k, size_n, perm, tile) q_w = q_w.cpu().numpy().astype(np.uint32) q_packed = np.zeros((q_w.shape[0], q_w.shape[1] // pack_factor), dtype=np.uint32) for i in range(pack_factor): q_packed |= q_w[:, i::pack_factor] << num_bits * i q_packed = torch.from_numpy(q_packed.astype(np.int32)) return q_packed def pack_scales_24(scales, quantization_args, w_shape): size_k = w_shape[0] size_n = w_shape[1] num_bits = quantization_args.num_bits _, scale_perm_2_4, scale_perm_single_2_4 = get_permutations_24(num_bits) if ( quantization_args.strategy == QuantizationStrategy.GROUP and quantization_args.group_size < size_k ): scales = scales.reshape((-1, len(scale_perm_2_4)))[:, scale_perm_2_4] else: # channelwise scales = scales.reshape((-1, len(scale_perm_single_2_4)))[ :, scale_perm_single_2_4 ] scales = scales.reshape((-1, size_n)).contiguous() return scales compressed-tensors-0.9.4/src/compressed_tensors/config/000077500000000000000000000000001500222531600233345ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/config/__init__.py000066400000000000000000000013411500222531600254440ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .base import * from .dense import * from .sparse_24_bitmask import * from .sparse_bitmask import * compressed-tensors-0.9.4/src/compressed_tensors/config/base.py000066400000000000000000000066271500222531600246330ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from enum import Enum, unique from typing import List, Optional from compressed_tensors.registry import RegistryMixin from pydantic import BaseModel __all__ = ["SparsityCompressionConfig", "CompressionFormat", "SparsityStructure"] @unique class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" sparse_24_bitmask = "sparse-24-bitmask" int_quantized = "int-quantized" float_quantized = "float-quantized" naive_quantized = "naive-quantized" pack_quantized = "pack-quantized" marlin_24 = "marlin-24" @unique class SparsityStructure(Enum): """ An enumeration to represent different sparsity structures. Attributes ---------- TWO_FOUR : str Represents a 2:4 sparsity structure. ZERO_ZERO : str Represents a 0:0 sparsity structure. UNSTRUCTURED : str Represents an unstructured sparsity structure. Examples -------- >>> SparsityStructure('2:4') >>> SparsityStructure('unstructured') >>> SparsityStructure('2:4') == SparsityStructure.TWO_FOUR True >>> SparsityStructure('UNSTRUCTURED') == SparsityStructure.UNSTRUCTURED True >>> SparsityStructure(None) == SparsityStructure.UNSTRUCTURED True >>> SparsityStructure('invalid') Traceback (most recent call last): ... ValueError: invalid is not a valid SparsityStructure """ TWO_FOUR = "2:4" UNSTRUCTURED = "unstructured" ZERO_ZERO = "0:0" def __new__(cls, value): obj = object.__new__(cls) obj._value_ = value.lower() if value is not None else value return obj @classmethod def _missing_(cls, value): # Handle None and case-insensitive values if value is None: return cls.UNSTRUCTURED for member in cls: if member.value == value.lower(): return member raise ValueError(f"{value} is not a valid {cls.__name__}") class SparsityCompressionConfig(RegistryMixin, BaseModel): """ Base data class for storing sparsity compression parameters :param format: name of compression format :param targets: List of layer names or layer types that aren't sparse and should be ignored during compression. By default, assume all layers are targeted :param ignore: List of layer names (unique) to ignore from targets. Defaults to None :param global_sparsity: average sparsity of the entire model :param sparsity_structure: structure of the sparsity, such as "unstructured", "2:4", "8:16" etc """ format: str targets: Optional[List[str]] = None ignore: Optional[List[str]] = None global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = "unstructured" compressed-tensors-0.9.4/src/compressed_tensors/config/dense.py000066400000000000000000000024451500222531600250110ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
from typing import Optional from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig __all__ = ["DenseSparsityConfig"] @SparsityCompressionConfig.register(name=CompressionFormat.dense.value) class DenseSparsityConfig(SparsityCompressionConfig): """ Identity configuration for storing a sparse model in an uncompressed dense format :param global_sparsity: average sparsity of the entire model :param sparsity_structure: structure of the sparsity, such as "unstructured", "2:4", "8:16" etc """ format: str = CompressionFormat.dense.value global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = "unstructured" compressed-tensors-0.9.4/src/compressed_tensors/config/sparse_24_bitmask.py000066400000000000000000000025711500222531600272270ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Optional from compressed_tensors.config import ( CompressionFormat, SparsityCompressionConfig, SparsityStructure, ) __all__ = ["Sparse24BitMaskConfig"] @SparsityCompressionConfig.register(name=CompressionFormat.sparse_24_bitmask.value) class Sparse24BitMaskConfig(SparsityCompressionConfig): """ Configuration for storing a 24 sparse model using bytemask compression :param global_sparsity: average sparsity of the entire model :param sparsity_structure: structure of the sparsity, should always be "2:4" for this compression format """ format: str = CompressionFormat.sparse_24_bitmask.value global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = SparsityStructure.TWO_FOUR.value compressed-tensors-0.9.4/src/compressed_tensors/config/sparse_bitmask.py000066400000000000000000000024341500222531600267200ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
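# Usage sketch (field values are hypothetical): BitmaskConfig is a pydantic
# model registered under the "sparse-bitmask" format name and can be built
# directly, e.g.
#
#     config = BitmaskConfig(global_sparsity=0.5, sparsity_structure="unstructured")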
from typing import Optional from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig __all__ = ["BitmaskConfig"] @SparsityCompressionConfig.register(name=CompressionFormat.sparse_bitmask.value) class BitmaskConfig(SparsityCompressionConfig): """ Configuration for storing a sparse model using bitmask compression :param global_sparsity: average sparsity of the entire model :param sparsity_structure: structure of the sparsity, such as "unstructured", "2:4", "8:16" etc """ format: str = CompressionFormat.sparse_bitmask.value global_sparsity: Optional[float] = 0.0 sparsity_structure: Optional[str] = "unstructured" compressed-tensors-0.9.4/src/compressed_tensors/linear/000077500000000000000000000000001500222531600233415ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/linear/__init__.py000066400000000000000000000011511500222531600254500ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/src/compressed_tensors/linear/compressed_linear.py000066400000000000000000000076141500222531600274210ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import warnings from typing import Dict, Tuple import torch from compressed_tensors.compressors.base import BaseCompressor from compressed_tensors.quantization import ( QuantizationScheme, QuantizationStatus, initialize_module_for_quantization, ) from compressed_tensors.utils import register_offload_parameter from torch import Tensor from torch.nn import Parameter from torch.nn.functional import linear from torch.nn.modules import Linear class CompressedLinear(Linear): """ Wrapper module for running a compressed forward pass of a quantized Linear module. The wrapped layer will decompressed on each forward call. """ def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) warnings.warn( "CompressedLinear should not be initialized directly. 
" "Use the from_linear method instead.", UserWarning, ) @classmethod @torch.no_grad() def from_linear( cls, module: Linear, quantization_scheme: QuantizationScheme, quantization_format: str, ): """ :param module: dense linear module to replace :param quantization_scheme: quantization config for the module to wrap :param quantization_format: compression format module is stored as :return: CompressedLinear module wrapping the input module """ module.__class__ = CompressedLinear module.compressor = BaseCompressor.load_from_registry(quantization_format) device = next(module.parameters()).device # this will initialize all the scales and zero points initialize_module_for_quantization( module, quantization_scheme, force_zero_point=False ) # get the shape and dtype of compressed parameters compression_params: Dict[str, Tuple] = module.compressor.compression_param_info( module.weight.shape, quantization_scheme.weights ) # no need for this once quantization is initialized, will be replaced # with the compressed parameter delattr(module, "weight") # populate compressed weights and quantization parameters for name, (shape, dtype) in compression_params.items(): param = Parameter( torch.empty(shape, device=device, dtype=dtype), requires_grad=False ) register_offload_parameter(module, name, param) # mark module as compressed module.quantization_status = QuantizationStatus.COMPRESSED # handles case where forward is wrapped in new_forward by accelerate hooks if hasattr(module, "_old_forward"): module._old_forward = CompressedLinear.forward.__get__( module, CompressedLinear ) return module def forward(self, input: Tensor) -> Tensor: """ Decompresses the weight, then runs the wrapped forward pass """ if self.quantization_status == QuantizationStatus.COMPRESSED: weight_data = self.compressor.decompress_module(self) param = Parameter(weight_data, requires_grad=False) register_offload_parameter(self, "weight", param) self.quantization_status = QuantizationStatus.FROZEN return linear(input, self.weight, self.bias) compressed-tensors-0.9.4/src/compressed_tensors/quantization/000077500000000000000000000000001500222531600246155ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/quantization/__init__.py000066400000000000000000000013701500222531600267270ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa # isort: skip_file from .quant_args import * from .quant_config import * from .quant_scheme import * from .lifecycle import * compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/000077500000000000000000000000001500222531600265545ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/__init__.py000066400000000000000000000014041500222531600306640ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa # isort: skip_file from .forward import * from .initialize import * from .compressed import * from .apply import * from .helpers import * compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/apply.py000066400000000000000000000432271500222531600302630ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging import re from collections import OrderedDict, defaultdict from copy import deepcopy from typing import Dict, Iterable, List, Optional from typing import OrderedDict as OrderedDictType from typing import Set, Union import torch from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization.lifecycle.compressed import ( compress_quantized_weights, ) from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, ) from compressed_tensors.quantization.quant_args import QuantizationArgs from compressed_tensors.quantization.quant_config import ( QuantizationConfig, QuantizationStatus, ) from compressed_tensors.quantization.quant_scheme import QuantizationScheme from compressed_tensors.quantization.utils import ( KV_CACHE_TARGETS, infer_quantization_status, is_kv_cache_quant_scheme, iter_named_leaf_modules, iter_named_quantizable_modules, ) from compressed_tensors.utils.helpers import fix_fsdp_module_name, replace_module from compressed_tensors.utils.offload import update_parameter_data from compressed_tensors.utils.safetensors_load import get_safetensors_folder from safetensors import safe_open from torch.nn import Module __all__ = [ "load_pretrained_quantization_parameters", "apply_quantization_config", "apply_quantization_status", "find_name_or_class_matches", "expand_target_names", "is_target", ] from compressed_tensors.quantization.utils.helpers import is_module_quantized from compressed_tensors.utils.safetensors_load import ( get_quantization_parameter_to_path_mapping, ) _LOGGER = logging.getLogger(__name__) def load_pretrained_quantization_parameters( model: Module, model_name_or_path: Optional[str] = None, load_weight_quantization: Optional[bool] = False, ): """ Loads the quantization parameters (scale and zero point) from model_name_or_path to a model that has already been initialized with a quantization config. NOTE: Will always load inputs/output parameters. Will conditioanlly load weight parameters, if load_weight_quantization is set to True. 
:param model: model to load pretrained quantization parameters to :param model_name_or_path: Hugging Face stub or local folder containing a quantized model, which is used to load quantization parameters :param load_weight_quantization: whether or not the weight quantization parameters shoud be laoded """ model_path = get_safetensors_folder(model_name_or_path) mapping = get_quantization_parameter_to_path_mapping(model_path) for name, submodule in iter_named_leaf_modules(model): if not is_module_quantized(submodule): continue if submodule.quantization_scheme.input_activations is not None: base_name = "input" _load_quant_args_from_mapping( base_name=base_name, module_name=name, module=submodule, mapping=mapping, ) if submodule.quantization_scheme.output_activations is not None: base_name = "output" _load_quant_args_from_mapping( base_name=base_name, module_name=name, module=submodule, mapping=mapping, ) if load_weight_quantization and submodule.quantization_scheme.weights: base_name = "weight" _load_quant_args_from_mapping( base_name=base_name, module_name=name, module=submodule, mapping=mapping, ) def apply_quantization_config( model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False ) -> OrderedDict: """ Initializes the model for quantization in-place based on the given config. Optionally coverts quantizable modules to compressed_linear modules :param model: model to apply quantization config to :param config: quantization config :param run_compressed: Whether the model will be run in compressed mode or decompressed fully on load """ # Workaround for when HF Quantizer passes None, see PR #180 if config is None: return OrderedDict() # remove reference to the original `config` # argument. This function can mutate it, and we'd # like to keep the original `config` as it is. 
config = deepcopy(config) # build mapping of targets to schemes for easier matching # use ordered dict to preserve target ordering in config target_to_scheme = OrderedDict() config = process_quantization_config(config) names_to_scheme = OrderedDict() for scheme in config.config_groups.values(): for target in scheme.targets: target_to_scheme[target] = scheme if run_compressed: from compressed_tensors.linear.compressed_linear import CompressedLinear # list of submodules to ignore ignored_submodules = defaultdict(list) # mark appropriate layers for quantization by setting their quantization schemes for name, submodule in iter_named_quantizable_modules( model, include_children=True, include_attn=True, ): # child modules and attention modules # potentially fix module name to remove FSDP wrapper prefix name = fix_fsdp_module_name(name) if matches := find_name_or_class_matches(name, submodule, config.ignore): for match in matches: ignored_submodules[match].append(name) continue # layer matches ignore list, continue targets = find_name_or_class_matches(name, submodule, target_to_scheme) if targets: # mark modules to be quantized by adding # quant scheme to the matching layers scheme = _scheme_from_targets(target_to_scheme, targets, name) if run_compressed: format = config.format if format != CompressionFormat.dense.value: if isinstance(submodule, torch.nn.Linear): # TODO: expand to more module types compressed_linear = CompressedLinear.from_linear( submodule, quantization_scheme=scheme, quantization_format=format, ) replace_module(model, name, compressed_linear) # target matched - add layer and scheme to target list submodule.quantization_scheme = _scheme_from_targets( target_to_scheme, targets, name ) names_to_scheme[name] = submodule.quantization_scheme.weights if config.ignore is not None and ignored_submodules is not None: if set(config.ignore) - set(ignored_submodules): _LOGGER.warning( "Some layers that were to be ignored were " "not found in the model: " f"{set(config.ignore) - set(ignored_submodules)}" ) # apply current quantization status across all targeted layers apply_quantization_status(model, config.quantization_status) return names_to_scheme def process_quantization_config(config: QuantizationConfig) -> QuantizationConfig: """ Preprocess the raw QuantizationConfig :param config: the raw QuantizationConfig :return: the processed QuantizationConfig """ if config.kv_cache_scheme is not None: config = process_kv_cache_config(config) return config def process_kv_cache_config( config: QuantizationConfig, targets: Union[List[str], str] = KV_CACHE_TARGETS ) -> QuantizationConfig: """ Reformulate the `config.kv_cache` as a `config_group` and add it to the set of existing `config.groups` :param config: the QuantizationConfig :return: the QuantizationConfig with additional "kv_cache" group """ if targets == KV_CACHE_TARGETS: _LOGGER.info(f"KV cache targets set to default value of: {KV_CACHE_TARGETS}") kv_cache_dict = config.kv_cache_scheme.model_dump() kv_cache_scheme = QuantizationScheme( output_activations=QuantizationArgs(**kv_cache_dict), targets=targets, ) kv_cache_group = dict(kv_cache=kv_cache_scheme) config.config_groups.update(kv_cache_group) return config def apply_quantization_status(model: Module, status: QuantizationStatus): """ Applies in place the quantization lifecycle up to the given status :param model: model to apply quantization to :param status: status to update the module to """ current_status = infer_quantization_status(model) if status >= QuantizationStatus.INITIALIZED 
> current_status: force_zero_point_init = status != QuantizationStatus.COMPRESSED # When decompressing, we set the scale_dtype as the model's dtype # This is because the normal workflow of using the weight's dtype # will be incorrect as the model weight will be compressed # Therfore, use the dtype set by the user using the PretrainedModel scale_dtype = None if status == QuantizationStatus.FROZEN: if hasattr(model, "dtype"): scale_dtype = model.dtype model.apply( lambda module: initialize_module_for_quantization( module, force_zero_point=force_zero_point_init, scale_dtype=scale_dtype ) ) if current_status < status >= QuantizationStatus.COMPRESSED > current_status: model.apply(compress_quantized_weights) def expand_target_names( model: Module, targets: Optional[Iterable[str]] = None, ignore: Optional[Iterable[str]] = None, ) -> Set[str]: """ Finds all unique module names in the model that match the given targets and ignore lists. Note: Targets must be regexes, layer types, or full layer names. :param model: model to search for targets in :param targets: Iterable of targets to search for :param ignore: Iterable of targets to ignore :return: set of all targets that match the given targets and should not be ignored """ return { name for name, module in iter_named_leaf_modules(model) if is_target(name, module, targets, ignore) } def is_target( name: str, module: Module, targets: Optional[Iterable[str]] = None, ignore: Optional[Iterable[str]] = None, ) -> bool: """ Determines if a module should be included in the targets based on the targets and ignore lists. Note: Targets must be regexes, layer types, or full layer names. :param name: name of the module :param module: the module itself :param targets: Iterable of targets to search for :param ignore: Iterable of targets to ignore :return: True if the module is a target and not ignored, False otherwise """ return bool( find_name_or_class_matches(name, module, targets or []) and not find_name_or_class_matches(name, module, ignore or []) ) def find_name_or_class_matches( name: str, module: Module, targets: Iterable[str], check_contains: bool = False ) -> List[str]: """ Returns all targets that match the given name or the class name. Returns empty list otherwise. The order of the output `matches` list matters. The entries are sorted in the following order: 1. matches on exact strings 2. matches on regex patterns 3. matches on module names """ targets = sorted(targets, key=lambda x: ("re:" in x, x)) if isinstance(targets, Iterable): matches = _find_matches(name, targets) + _find_matches( module.__class__.__name__, targets, check_contains ) matches = [match for match in matches if match is not None] return matches def _find_matches( value: str, targets: Iterable[str], check_contains: bool = False ) -> List[str]: # returns all the targets that match value either # exactly or as a regex after 're:'. if check_contains is set to True, # additionally checks if the target string is contained with value. 
matches = [] for target in targets: if target.startswith("re:"): pattern = target[3:] if re.match(pattern, value): matches.append(target) elif check_contains: if target.lower() in value.lower(): matches.append(target) elif target == value: matches.append(target) return matches def _infer_status(model: Module) -> Optional[QuantizationStatus]: for module in model.modules(): status = getattr(module, "quantization_status", None) if status is not None: return status return None def _load_quant_args_from_mapping( base_name: str, module_name: str, module: Module, mapping: Dict ): # TODO: skip update and just register here, don't do it in initialize """ Loads scale and zero point from a state_dict into the specified module :param base_name: quantization target, one of: weights, input_activations or output_activations :param module_name: pytorch module name to look up in state_dict :module: pytorch module associated with module_name :mapping: mapping to search fetch paths on disk for a given parameter """ scale_name = f"{base_name}_scale" zp_name = f"{base_name}_zero_point" g_idx_name = f"{base_name}_g_idx" state_dict_scale_path = mapping.get(f"{module_name}.{scale_name}", None) state_dict_zp_path = mapping.get(f"{module_name}.{zp_name}", None) state_dict_g_idx_path = mapping.get(f"{module_name}.{g_idx_name}", None) if state_dict_g_idx_path is not None: with safe_open(state_dict_g_idx_path, framework="pt", device="cpu") as f: state_dict_g_idx = f.get_tensor(f"{module_name}.{g_idx_name}") update_parameter_data(module, state_dict_g_idx, g_idx_name) if state_dict_scale_path is not None: # module is quantized with safe_open(state_dict_scale_path, framework="pt", device="cpu") as f: state_dict_scale = f.get_tensor(f"{module_name}.{scale_name}") update_parameter_data(module, state_dict_scale, scale_name) if state_dict_zp_path is None: # fill in zero point for symmetric quantization state_dict_zp = torch.zeros_like(state_dict_scale, device="cpu") else: with safe_open(state_dict_zp_path, framework="pt", device="cpu") as f: state_dict_zp = f.get_tensor(f"{module_name}.{zp_name}") update_parameter_data(module, state_dict_zp, zp_name) def _scheme_from_targets( target_to_scheme: OrderedDictType[str, QuantizationScheme], targets: List[str], name: str, ) -> QuantizationScheme: if len(targets) == 1: # if `targets` iterable contains a single element # use it as the key return target_to_scheme[targets[0]] # otherwise, we need to merge QuantizationSchemes corresponding # to multiple targets. 
This is most likely because `name` module # is being target both as an ordinary quantization target, as well # as kv cache quantization target schemes_to_merge = [target_to_scheme[target] for target in targets] return _merge_schemes(schemes_to_merge, name) def _merge_schemes( schemes_to_merge: List[QuantizationScheme], name: str ) -> QuantizationScheme: kv_cache_quantization_scheme = [ scheme for scheme in schemes_to_merge if is_kv_cache_quant_scheme(scheme) ] if not kv_cache_quantization_scheme: # if the schemes_to_merge do not contain any # kv cache QuantizationScheme # return the first scheme (the prioritized one, # since the order of schemes_to_merge matters) return schemes_to_merge[0] else: # fetch the kv cache QuantizationScheme and the highest # priority non-kv cache QuantizationScheme and merge them kv_cache_quantization_scheme = kv_cache_quantization_scheme[0] quantization_scheme = [ scheme for scheme in schemes_to_merge if not is_kv_cache_quant_scheme(scheme) ][0] schemes_to_merge = [kv_cache_quantization_scheme, quantization_scheme] merged_scheme = {} for scheme in schemes_to_merge: scheme_dict = { k: v for k, v in scheme.model_dump().items() if v is not None } # when merging multiple schemes, the final target will be # the `name` argument - hence erase the original targets del scheme_dict["targets"] # make sure that schemes do not "clash" with each other overlapping_keys = set(merged_scheme.keys()) & set(scheme_dict.keys()) if overlapping_keys: raise ValueError( f"The module: {name} is being modified by two clashing " f"quantization schemes, that jointly try to override " f"properties: {overlapping_keys}. Fix the quantization config " "so that it is not ambiguous." ) merged_scheme.update(scheme_dict) merged_scheme.update(targets=[name]) return QuantizationScheme(**merged_scheme) compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/compressed.py000066400000000000000000000043701500222531600312760ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
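# A compact end-to-end sketch of the lifecycle helpers from apply.py above:
# build a QuantizationConfig from a preset group, apply it to a toy model, then
# advance the quantization status. The preset name "W8A16" and the toy model are
# illustrative; with real checkpoints the scales are calibrated before any
# compression step.
if __name__ == "__main__":
    import torch
    from compressed_tensors.quantization import (
        QuantizationConfig,
        QuantizationStatus,
        apply_quantization_config,
        apply_quantization_status,
    )

    model = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.Linear(32, 8))

    # config_groups may map a preset scheme name to a list of target layer types;
    # model_post_init expands it into a full QuantizationScheme
    config = QuantizationConfig(config_groups={"W8A16": ["Linear"]})

    names_to_scheme = apply_quantization_config(model, config)
    print(list(names_to_scheme.keys()))  # ['0', '1'] for the two Linear layers

    # later lifecycle transition: quantize weights in place (demonstration only,
    # since no calibration has been run here)
    apply_quantization_status(model, QuantizationStatus.COMPRESSED)
    print(model[0].quantization_status)  # QuantizationStatus.COMPRESSED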
import logging import torch from compressed_tensors.quantization.lifecycle.forward import quantize from compressed_tensors.quantization.quant_config import QuantizationStatus from torch.nn import Module __all__ = [ "compress_quantized_weights", ] _LOGGER = logging.getLogger(__name__) def compress_quantized_weights(module: Module): """ Quantizes the module weight representation to use fewer bits in memory apply to full model with `model.apply(compress_quantized_weights)` :param module: module to compress to quantized representation """ scheme = getattr(module, "quantization_scheme", None) if not scheme or not scheme.weights: # no quantization scheme or weights not quantized, nothing to do return if scheme is QuantizationStatus.COMPRESSED: # module is already compressed, nothing to do return weight = getattr(module, "weight", None) scale = getattr(module, "weight_scale", None) zero_point = getattr(module, "weight_zero_point", None) g_idx = getattr(module, "weight_g_idx", None) if weight is None or scale is None: # no weight, scale, or ZP, nothing to do # mark as compressed here to maintain consistent status throughout the model module.quantization_status = QuantizationStatus.COMPRESSED return module.weight.requires_grad = False # cannot use auto grad after compression module.weight.data = quantize( x=weight, scale=scale, zero_point=zero_point, g_idx=g_idx, args=scheme.weights, dtype=torch.int8, ) module.quantization_status = QuantizationStatus.COMPRESSED compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/forward.py000066400000000000000000000312411500222531600305730ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from functools import wraps from math import ceil from typing import Optional import torch from compressed_tensors.quantization.quant_args import ( QuantizationArgs, QuantizationStrategy, round_to_quantized_type, ) from compressed_tensors.quantization.quant_config import QuantizationStatus from compressed_tensors.quantization.quant_scheme import QuantizationScheme from compressed_tensors.quantization.utils import ( calculate_range, compute_dynamic_scales_and_zp, ) from compressed_tensors.utils import safe_permute from torch.nn import Module __all__ = [ "quantize", "dequantize", "fake_quantize", "wrap_module_forward_quantized", "forward_quantize", ] @torch.no_grad() def quantize( x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor, args: QuantizationArgs, dtype: Optional[torch.dtype] = None, g_idx: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Quantize the input tensor x using the QuantizationStrategy specified in args. Quantization can be done per tensor, channel, token or group. For group quantization, the group_size must be divisible by the column size. 
The input scale and zero_points are reshaped to support vectorization (Assumes 1 is the channel dimension) :param x: Input tensor :param scale: scale tensor :param zero_point: zero point tensor :param args: quantization args dictating how to quantize x :param dtype: optional dtype to cast the quantized output to :param g_idx: optional mapping from column index to group index :return: fake quantized tensor """ return _process_quantization( x=x, scale=scale, zero_point=zero_point, args=args, dtype=dtype, do_quantize=True, do_dequantize=False, g_idx=g_idx, ) @torch.no_grad() def dequantize( x_q: torch.Tensor, scale: torch.Tensor, zero_point: Optional[torch.Tensor] = None, args: Optional[QuantizationArgs] = None, dtype: Optional[torch.dtype] = None, g_idx: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Dequantize a quantized input tensor x_q based on the strategy specified in args. If args is not provided, the strategy will be inferred. :param x: quantized input tensor :param scale: scale tensor :param zero_point: zero point tensor :param args: quantization args used to quantize x_q :param dtype: optional dtype to cast the dequantized output to :param g_idx: optional mapping from column index to group index :return: dequantized float tensor """ if args is None: if scale.ndim == 0 or scale.ndim == 1: args = QuantizationArgs(strategy=QuantizationStrategy.TENSOR) elif scale.ndim == 2: if scale.shape[1] == 1: args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL) else: group_size = int(x_q.shape[1] / scale.shape[1]) args = QuantizationArgs( strategy=QuantizationStrategy.GROUP, group_size=group_size ) else: raise ValueError( f"Could not infer a quantization strategy from scale with {scale.ndim} " "dimmensions. Expected 0 or 2 dimmensions." ) if dtype is None: dtype = scale.dtype return _process_quantization( x=x_q, scale=scale, zero_point=zero_point, args=args, do_quantize=False, do_dequantize=True, dtype=dtype, g_idx=g_idx, ) @torch.no_grad() def fake_quantize( x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor, args: QuantizationArgs, g_idx: Optional[torch.Tensor] = None, ) -> torch.Tensor: """ Fake quantize the input tensor x by quantizing then dequantizing with the QuantizationStrategy specified in args. Quantization can be done per tensor, channel, token or group. For group quantization, the group_size must be divisible by the column size. 
The input scale and zero_points are reshaped to support vectorization (Assumes 1 is the channel dimension) :param x: Input tensor :param scale: scale tensor :param zero_point: zero point tensor :param args: quantization args dictating how to quantize x :param g_idx: optional mapping from column index to group index :return: fake quantized tensor """ return _process_quantization( x=x, scale=scale, zero_point=zero_point, args=args, do_quantize=True, do_dequantize=True, g_idx=g_idx, ) @torch.no_grad() def _process_quantization( x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor, args: QuantizationArgs, g_idx: Optional[torch.Tensor] = None, dtype: Optional[torch.dtype] = None, do_quantize: bool = True, do_dequantize: bool = True, ) -> torch.Tensor: q_min, q_max = calculate_range(args, x.device) group_size = args.group_size if args.strategy == QuantizationStrategy.GROUP: output_dtype = dtype if dtype is not None else x.dtype output = torch.zeros_like(x).to(output_dtype) columns = output.shape[1] # TODO: make validation step for inputs while scale.ndim < 2: # pad scale and zero point dims for slicing scale = scale.unsqueeze(1) zero_point = zero_point.unsqueeze(1) if zero_point is not None else None if columns >= group_size: if columns % group_size != 0: raise ValueError( "tensor column shape must be divisble " f"by the given group_size {group_size}" ) # support column-order (default) quantization as well as other orderings # such as activation ordering. Below checks if g_idx has been initialized is_column_order = g_idx is None or -1 in g_idx if is_column_order: num_groups = int(ceil(columns / group_size)) group_sizes = torch.full((num_groups,), group_size, dtype=torch.int) else: group_indices, group_sizes = torch.unique(g_idx, return_counts=True) group_sizes = group_sizes[torch.argsort(group_indices)] perm = torch.argsort(g_idx) x = safe_permute(x, perm, dim=1) # TODO: experiment with vectorizing for loop for performance end = 0 for index, group_count in enumerate(group_sizes): sc = scale[:, index].view(-1, 1) zp = zero_point[:, index].view(-1, 1) if zero_point is not None else None start = end end = start + group_count if do_quantize: output[:, start:end] = _quantize( x[:, start:end], sc, zp, q_min, q_max, args, dtype=dtype, ) if do_dequantize: input = output[:, start:end] if do_quantize else x[:, start:end] output[:, start:end] = _dequantize(input, sc, zp) if not is_column_order: output = safe_permute(output, torch.argsort(perm), dim=1) else: # covers channel, token and tensor strategies if do_quantize: output = _quantize( x, scale, zero_point, q_min, q_max, args, dtype=dtype, ) if do_dequantize: output = _dequantize(output if do_quantize else x, scale, zero_point) return output def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme): # expects a module already initialized and injected with the parameters in # initialize_module_for_quantization if hasattr(module.forward, "__func__"): forward_func_orig = module.forward.__func__ else: forward_func_orig = module.forward.func @wraps(forward_func_orig) # ensures docstring, names, etc are propagated def wrapped_forward(self, *args, **kwargs): if not getattr(module, "quantization_enabled", True): # quantization is disabled on forward passes, return baseline # forward call return forward_func_orig.__get__(module, module.__class__)(*args, **kwargs) input_ = args[0] compressed = module.quantization_status == QuantizationStatus.COMPRESSED if scheme.input_activations is not None: # prehook should calibrate activations 
before forward call input_ = forward_quantize(module, input_, "input", scheme.input_activations) if scheme.weights is not None and not compressed: # calibrate and (fake) quantize weights when applicable unquantized_weight = self.weight.data.clone() self.weight.data = forward_quantize( module, self.weight, "weight", scheme.weights ) # perform wrapped forward call output = forward_func_orig.__get__(module, module.__class__)( input_, *args[1:], **kwargs ) # restore back to unquantized_value if scheme.weights is not None and not compressed: self.weight.data = unquantized_weight if scheme.output_activations is not None: # forward-hook should calibrate/forward_quantize if ( module.quantization_status == QuantizationStatus.CALIBRATION and not scheme.output_activations.dynamic ): return output output = forward_quantize( module, output, "output", scheme.output_activations ) return output # bind wrapped forward to module class so reference to `self` is correct bound_wrapped_forward = wrapped_forward.__get__(module, module.__class__) # set forward to wrapped forward setattr(module, "forward", bound_wrapped_forward) def forward_quantize( module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs" ) -> torch.Tensor: # in compressed mode, the weight is already compressed and quantized so we don't # need to run fake quantization if ( module.quantization_status == QuantizationStatus.COMPRESSED and base_name == "weight" ): return value if value.numel() == 0: # if the tensor is empty, # skip quantization return value g_idx = getattr(module, "weight_g_idx", None) if args.dynamic: # dynamic quantization - determine the scale/zp on the fly scale, zero_point = compute_dynamic_scales_and_zp(value=value, args=args) else: # static quantization - get scale and zero point from layer scale = getattr(module, f"{base_name}_scale") zero_point = getattr(module, f"{base_name}_zero_point", None) return fake_quantize( x=value, scale=scale, zero_point=zero_point, args=args, g_idx=g_idx, ) @torch.no_grad() def _quantize( x: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor, q_min: torch.Tensor, q_max: torch.Tensor, args: QuantizationArgs, dtype: Optional[torch.dtype] = None, ) -> torch.Tensor: scaled = x / scale if zero_point is not None: scaled += zero_point.to(x.dtype) # clamp first because cast isn't guaranteed to be saturated (ie for fp8) clamped_value = torch.clamp( scaled, q_min, q_max, ) quantized_value = round_to_quantized_type(clamped_value, args) if dtype is not None: quantized_value = quantized_value.to(dtype) return quantized_value @torch.no_grad() def _dequantize( x_q: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor = None, dtype: Optional[torch.dtype] = None, ) -> torch.Tensor: dequant_value = x_q.to(scale.dtype) if zero_point is not None: dequant_value = dequant_value - zero_point.to(scale.dtype) dequant_value = dequant_value * scale if dtype is not None: dequant_value = dequant_value.to(dtype) return dequant_value compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/helpers.py000066400000000000000000000016601500222531600305730ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
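# Round-trip sketch for the tensor-level helpers defined in forward.py above
# (quantize, dequantize, fake_quantize). The hand-picked scale and zero point
# are purely illustrative; in the library they come from calibration observers.
if __name__ == "__main__":
    import torch
    from compressed_tensors.quantization import QuantizationArgs
    from compressed_tensors.quantization.lifecycle.forward import (
        dequantize,
        fake_quantize,
        quantize,
    )

    args = QuantizationArgs(num_bits=8, symmetric=True)  # per-tensor int8
    x = torch.randn(4, 8)
    scale = x.abs().max() / 127.0
    zero_point = torch.zeros(1, dtype=torch.int8)

    x_q = quantize(x, scale=scale, zero_point=zero_point, args=args, dtype=torch.int8)
    x_dq = dequantize(x_q, scale=scale, zero_point=zero_point, args=args)
    x_fake = fake_quantize(x, scale=scale, zero_point=zero_point, args=args)

    # fake_quantize is quantize followed by dequantize, up to dtype casting
    assert torch.allclose(x_dq, x_fake, atol=1e-6)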
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Miscelaneous helpers for the quantization lifecycle """ from torch.nn import Module __all__ = [ "enable_quantization", "disable_quantization", ] def enable_quantization(module: Module): module.quantization_enabled = True def disable_quantization(module: Module): module.quantization_enabled = False compressed-tensors-0.9.4/src/compressed_tensors/quantization/lifecycle/initialize.py000066400000000000000000000175121500222531600312750ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from enum import Enum from typing import Optional import torch from compressed_tensors.quantization.lifecycle.forward import ( wrap_module_forward_quantized, ) from compressed_tensors.quantization.quant_args import ( ActivationOrdering, QuantizationArgs, QuantizationStrategy, ) from compressed_tensors.quantization.quant_config import QuantizationStatus from compressed_tensors.quantization.quant_scheme import QuantizationScheme from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme from compressed_tensors.utils import ( disable_hf_hook, has_offloaded_params, register_offload_parameter, ) from torch.nn import Module, Parameter __all__ = [ "initialize_module_for_quantization", "is_attention_module", "KVCacheScaleType", ] _LOGGER = logging.getLogger(__name__) class KVCacheScaleType(Enum): KEY = "k_scale" VALUE = "v_scale" def initialize_module_for_quantization( module: Module, scheme: Optional[QuantizationScheme] = None, force_zero_point: bool = True, scale_dtype: Optional[torch.dtype] = None, ): """ attaches appropriate scales, zero points, and observers to a layer given its target quantization scheme apply to full model with `model.apply(initialize_module_for_quantization)` :param module: module to set for calibration :param scheme: scheme to use for quantization. 
if None is provided, will attempt to use scheme stored in the module under `quantization_scheme`, if not provided, the layer will be skipped :param force_zero_point: whether to force initialization of a zero point for symmetric quantization :param scale_dtype: dtype to used for the scales, if overriding the weight dtype as the scale dtype """ # TODO: don't initialize parameters when running decompression scheme = scheme or getattr(module, "quantization_scheme", None) if scheme is None: # no scheme passed and layer not targeted for quantization - skip return if is_attention_module(module): # quantized actions based on calltime status _initialize_attn_scales(module) else: if scheme.input_activations is not None: _initialize_scale_zero_point( module, "input", scheme.input_activations, force_zero_point=force_zero_point, scale_dtype=scale_dtype, ) if scheme.weights is not None: if hasattr(module, "weight"): weight_shape = None if isinstance(module, torch.nn.Linear): weight_shape = module.weight.shape _initialize_scale_zero_point( module, "weight", scheme.weights, weight_shape=weight_shape, force_zero_point=force_zero_point, scale_dtype=scale_dtype, ) else: _LOGGER.warning( f"module type {type(module)} targeted for weight quantization but " "has no attribute weight, skipping weight quantization " f"for {type(module)}" ) if scheme.output_activations is not None: if not is_kv_cache_quant_scheme(scheme): _initialize_scale_zero_point( module, "output", scheme.output_activations, scale_dtype=scale_dtype ) module.quantization_scheme = scheme module.quantization_status = QuantizationStatus.INITIALIZED with disable_hf_hook(module): # wrap forward call of module to perform # quantized actions based on calltime status wrap_module_forward_quantized(module, scheme) def is_attention_module(module: Module): return "attention" in module.__class__.__name__.lower() and ( hasattr(module, "k_proj") or hasattr(module, "v_proj") or hasattr(module, "qkv_proj") ) def _initialize_scale_zero_point( module: Module, base_name: str, quantization_args: QuantizationArgs, weight_shape: Optional[torch.Size] = None, force_zero_point: bool = True, scale_dtype: Optional[torch.dtype] = None, ): if quantization_args.dynamic: return # begin on the same device as other parameters or cpu if offloaded. 
# in the offloaded case, there's no point moving tensors to the execution device # if they're going to be immediately offloaded by `register_offload_parameter` params_device = next(module.parameters()).device device = "cpu" if has_offloaded_params(module) else params_device # infer expected scale/zero point shape if quantization_args.strategy == QuantizationStrategy.TOKEN: expected_shape = (1, 1) else: expected_shape = 1 if base_name == "weight" and weight_shape is not None: if quantization_args.strategy == QuantizationStrategy.CHANNEL: # (output_channels, 1) expected_shape = (weight_shape[0], 1) elif quantization_args.strategy == QuantizationStrategy.GROUP: num_groups = weight_shape[1] // quantization_args.group_size expected_shape = (weight_shape[0], max(num_groups, 1)) scale_dtype = scale_dtype if scale_dtype is not None else module.weight.dtype # TODO: consider erroring out in the future as if the dtype if not one fo these, # there is likely bug if scale_dtype not in [torch.float16, torch.bfloat16, torch.float32]: scale_dtype = torch.float16 # initializes empty scale, zero point, and g_idx parameters for the module init_scale = Parameter( torch.empty(expected_shape, dtype=scale_dtype, device=device), requires_grad=False, ) register_offload_parameter(module, f"{base_name}_scale", init_scale) if force_zero_point or not quantization_args.symmetric: zp_dtype = quantization_args.pytorch_dtype() init_zero_point = Parameter( torch.zeros(expected_shape, device=device, dtype=zp_dtype), requires_grad=False, ) register_offload_parameter(module, f"{base_name}_zero_point", init_zero_point) # only grouped activation ordering has g_idx if quantization_args.actorder == ActivationOrdering.GROUP: g_idx_shape = (weight_shape[1],) g_idx_dtype = torch.int init_g_idx = Parameter( torch.full(g_idx_shape, -1, device=device, dtype=g_idx_dtype), requires_grad=False, ) register_offload_parameter(module, f"{base_name}_g_idx", init_g_idx) def _initialize_attn_scales(module: Module) -> None: """Initlaize k_scale, v_scale for self_attn""" expected_shape = 1 # per tensor param = next(module.parameters()) scale_dtype = param.dtype device = param.device init_scale = Parameter( torch.empty(expected_shape, dtype=scale_dtype, device=device), requires_grad=False, ) register_offload_parameter(module, KVCacheScaleType.KEY.value, init_scale) init_scale = Parameter( torch.empty(expected_shape, dtype=scale_dtype, device=device), requires_grad=False, ) register_offload_parameter(module, KVCacheScaleType.VALUE.value, init_scale) compressed-tensors-0.9.4/src/compressed_tensors/quantization/quant_args.py000066400000000000000000000216351500222531600273420ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
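# Sketch of what initialize_module_for_quantization (initialize.py above)
# attaches to a plain Linear layer. The 4-bit group scheme below is an assumed
# example; the resulting parameter shapes follow the rules documented in
# _initialize_scale_zero_point.
if __name__ == "__main__":
    import torch
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationScheme,
        QuantizationStatus,
        initialize_module_for_quantization,
    )

    layer = torch.nn.Linear(in_features=256, out_features=64)
    scheme = QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(num_bits=4, strategy="group", group_size=128),
    )
    initialize_module_for_quantization(layer, scheme)

    # group strategy: one scale (and zero point) column per group of 128 inputs
    print(layer.weight_scale.shape)       # torch.Size([64, 2])
    print(layer.weight_zero_point.shape)  # torch.Size([64, 2])
    print(layer.quantization_status == QuantizationStatus.INITIALIZED)  # True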
import warnings from enum import Enum from typing import Any, Dict, Optional, Union import torch from compressed_tensors.utils import Aliasable from compressed_tensors.utils.helpers import deprecated from pydantic import BaseModel, Field, field_validator, model_validator __all__ = [ "FP8_DTYPE", "QuantizationType", "QuantizationStrategy", "QuantizationArgs", "round_to_quantized_type", "ActivationOrdering", ] FP8_DTYPE = torch.float8_e4m3fn class QuantizationType(str, Enum): """ Enum storing quantization type options """ INT = "int" FLOAT = "float" class QuantizationStrategy(str, Enum): """ Enum storing quantization strategy options """ TENSOR = "tensor" CHANNEL = "channel" GROUP = "group" BLOCK = "block" TOKEN = "token" class ActivationOrdering(Aliasable, str, Enum): """ Enum storing strategies for activation ordering Group: reorder groups and weight\n Weight: only reorder weight, not groups. Slightly lower accuracy but also lower latency when compared to group actorder\n Dynamic: alias for Group\n Static: alias for Weight\n """ GROUP = "group" WEIGHT = "weight" # aliases DYNAMIC = "dynamic" STATIC = "static" @staticmethod def get_aliases() -> Dict[str, str]: return { "dynamic": "group", "static": "weight", } class QuantizationArgs(BaseModel, use_enum_values=True): """ User facing arguments used to define a quantization config for weights or activations :param num_bits: quantization bit depth :param type: dtype to quantized to, either int or float :param symmetric: whether or not quantization scale is symmetric about zero-point :param strategy: string id determining the scope of scale/zero-point to apply :param group_size: group length to use for the group strategy :param block_structure: 2d block structure to use for the block strategy, must be of the format "2x4", "8x16", etc. :param dynamic: set True to perform dynamic quantization - values will not be calibrated during calibration phase, instead during inference new quantization ranges will be observed with every sample. Defaults to False for static quantization. Note that enabling dynamic quantization will change the default observer to a memoryless one :param actorder: whether to apply group quantization in decreasing order of activation. Defaults to None for arbitrary ordering """ num_bits: int = 8 type: QuantizationType = QuantizationType.INT symmetric: bool = True group_size: Optional[int] = None strategy: Optional[QuantizationStrategy] = None block_structure: Optional[str] = None dynamic: bool = False actorder: Union[ActivationOrdering, bool, None] = None observer: Optional[str] = Field( default=None, description=( "Determines the method of computing quantization parameters (scales and " "zero-points). Defaults to min-max when not using dynamic quantization" ), ) observer_kwargs: Dict[str, Any] = Field( default_factory=dict, description=( "optional dict of kwargs to be passed directly to torch quantization " "Observers constructor excluding quantization range or symmetry" ), ) @field_validator("type", mode="before") def validate_type(cls, value) -> QuantizationType: if isinstance(value, str): return QuantizationType(value.lower()) return value @field_validator("group_size", mode="before") def validate_group(cls, value) -> Union[int, None]: if value is None: return value if value < -1: raise ValueError( f"Invalid group size {value}. 
Use group_size > 0 for " "strategy='group' and group_size = -1 for 'channel'" ) return value @field_validator("strategy", mode="before") def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]: if isinstance(value, str): return QuantizationStrategy(value.lower()) return value @field_validator("actorder", mode="before") def validate_actorder(cls, value) -> Optional[ActivationOrdering]: if isinstance(value, bool): return ActivationOrdering.GROUP if value else None if isinstance(value, str): return ActivationOrdering(value.lower()) return value @model_validator(mode="after") def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]: # extract user-passed values from dictionary strategy = model.strategy group_size = model.group_size actorder = model.actorder dynamic = model.dynamic observer = model.observer # infer strategy if strategy is None: if group_size is None: strategy = QuantizationStrategy.TENSOR elif group_size > 0: strategy = QuantizationStrategy.GROUP elif group_size == -1: strategy = QuantizationStrategy.CHANNEL else: raise ValueError( f"Invalid group size {group_size}. Use group_size > 0 for " "strategy='group' and group_size = -1 for 'channel'" ) # validate strategy and group if strategy == QuantizationStrategy.GROUP: if group_size is None or group_size <= 0: raise ValueError( f"strategy {strategy} requires group_size to be " "set to a positive value" ) if ( group_size is not None and group_size > 0 and strategy != QuantizationStrategy.GROUP ): raise ValueError("group_size requires strategy to be set to 'group'") # validate activation ordering and strategy if actorder is not None and strategy != QuantizationStrategy.GROUP: raise ValueError( "Must use group quantization strategy in order to apply " "activation ordering" ) # infer observer w.r.t. 
dynamic if dynamic: if strategy not in ( QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, ): raise ValueError( f"One of {QuantizationStrategy.TOKEN} or " f"{QuantizationStrategy.TENSOR} must be used for dynamic ", "quantization", ) if observer is not None: if observer != "memoryless": # avoid annoying users with old configs warnings.warn( "No observer is used for dynamic quantization, setting to None" ) observer = None elif observer is None: # default to minmax for non-dynamic cases observer = "minmax" # write back modified values model.strategy = strategy model.observer = observer return model def pytorch_dtype(self) -> torch.dtype: if self.type == QuantizationType.FLOAT: return FP8_DTYPE elif self.type == QuantizationType.INT: if self.num_bits <= 8: return torch.int8 elif self.num_bits <= 16: return torch.int16 else: return torch.int32 else: raise ValueError(f"Invalid quantization type {self.type}") @deprecated("QuantizationArgs.observer") def get_observer(self) -> str: return self.observer def round_to_quantized_type( tensor: torch.Tensor, args: QuantizationArgs ) -> torch.Tensor: """ Rounds each element of the input tensor to the nearest quantized representation, keeping to original dtype :param tensor: tensor to round :param args: QuantizationArgs to pull appropriate dtype from :return: rounded tensor """ original_dtype = tensor.dtype if args.type == QuantizationType.FLOAT: rounded = tensor.to(FP8_DTYPE) elif args.type == QuantizationType.INT: rounded = torch.round(tensor) else: raise ValueError(f"Invalid quantization type {args.type}") return rounded.to(original_dtype) compressed-tensors-0.9.4/src/compressed_tensors/quantization/quant_config.py000066400000000000000000000236571500222531600276610ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
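# Small sketch of the QuantizationArgs validators defined in quant_args.py above:
# strategy inference from group_size, the dynamic/observer interaction, and
# dtype-preserving rounding. Values are illustrative.
if __name__ == "__main__":
    import torch
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationStrategy,
        round_to_quantized_type,
    )

    # group_size > 0 with no explicit strategy infers strategy="group"
    grouped = QuantizationArgs(num_bits=4, group_size=128)
    print(grouped.strategy == QuantizationStrategy.GROUP)  # True

    # group_size == -1 infers per-channel quantization
    channel = QuantizationArgs(num_bits=8, group_size=-1)
    print(channel.strategy == QuantizationStrategy.CHANNEL)  # True

    # dynamic token quantization drops the observer (params computed on the fly),
    # while static args default to the "minmax" observer
    dynamic = QuantizationArgs(num_bits=8, strategy="token", dynamic=True)
    print(dynamic.observer, grouped.observer)  # None minmax

    # rounding snaps to the quantized grid but keeps the original dtype
    print(round_to_quantized_type(torch.tensor([0.4, 1.6, -2.3]), grouped))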
from enum import Enum from typing import Dict, List, Optional, Union from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization.quant_args import QuantizationArgs from compressed_tensors.quantization.quant_scheme import ( QuantizationScheme, preset_name_to_scheme, ) from compressed_tensors.quantization.utils import ( calculate_compression_ratio, is_module_quantized, iter_named_quantizable_modules, module_type, parse_out_kv_cache_args, ) from pydantic import BaseModel, Field from torch.nn import Module __all__ = [ "QuantizationStatus", "QuantizationConfig", "LIFECYCLE_ORDER", "DEFAULT_QUANTIZATION_METHOD", "DEFAULT_QUANTIZATION_FORMAT", ] class QuantizationStatus(str, Enum): """ Enum storing the different states a quantized layer can be in Initialized: scale, zero points and observers have been attached to the layer but are set to dummy values (not yet calibrated) Calibration: scale and zero points have been calibrated through OBCQ or similar algorithm, observers are still attached Frozen: scale and zero points are finalized, observers have been deleted, weights are still in their original precision Compressed: weights have been converted to their target type or compressed to their closed approximation """ INITIALIZED = "initialized" CALIBRATION = "calibration" FROZEN = "frozen" COMPRESSED = "compressed" @classmethod def lifecycle_order(cls) -> List["QuantizationStatus"]: """ :return: list of correct quantization lifecycle order """ return def __ge__(self, other): if other is None: return True if not isinstance(other, self.__class__): raise NotImplementedError return LIFECYCLE_ORDER.index(self) >= LIFECYCLE_ORDER.index(other) def __gt__(self, other): if other is None: return True if not isinstance(other, self.__class__): raise NotImplementedError return LIFECYCLE_ORDER.index(self) > LIFECYCLE_ORDER.index(other) def __lt__(self, other): if other is None: return False if not isinstance(other, self.__class__): raise NotImplementedError return LIFECYCLE_ORDER.index(self) < LIFECYCLE_ORDER.index(other) def __le__(self, other): if other is None: return False if not isinstance(other, self.__class__): raise NotImplementedError return LIFECYCLE_ORDER.index(self) <= LIFECYCLE_ORDER.index(other) LIFECYCLE_ORDER = [ QuantizationStatus.INITIALIZED, QuantizationStatus.CALIBRATION, QuantizationStatus.FROZEN, QuantizationStatus.COMPRESSED, ] DEFAULT_QUANTIZATION_METHOD = "compressed-tensors" DEFAULT_QUANTIZATION_FORMAT = "fakequant" class QuantizationConfig(BaseModel): """ Full configuration specifying how a model is quantized. Each quantized layer is mapped to a QuantizationScheme in config_groups. :param config_groups: dict of QuantizationSchemes specifying the quantization settings for each quantized layer. A group could also be a reference to a predefined scheme name, mapped to a list of its target layers/classes :param quant_method: a constant used to differentiate sparseML quantization from other quantization configs :param format: specifies how the quantized model is stored on disk :quantization_status: specifies the current status of all quantized layers. It is assumed all layers are in the same state. :param kv_cache_scheme: optional QuantizationArgs, that specify the quantization of the kv cache. If None, kv cache is not quantized. When applying kv cache quantization to transformer AutoModelForCausalLM, the kv_cache_scheme gets converted into a QuantizationScheme that: - targets the `q_proj` and `k_proj` modules of the model. 
The outputs of those modules are the keys and values that might be cached - quantizes the outputs of the aformentioned layers, so that keys and values are compressed before storing them in the cache There is an explicit assumption that the model contains modules with `k_proj` and `v_proj` in their names. If this is not the case and kv_cache_scheme != None, the quantization of kv cache will fail :global_compression_ratio: optional informational config to report the model compression ratio acheived by the quantization config :ignore: optional list of layers to ignore from config_groups. Layers in this list are not quantized even if they match up with a target in config_groups """ config_groups: Dict[str, Union[QuantizationScheme, List[str]]] quant_method: str = DEFAULT_QUANTIZATION_METHOD kv_cache_scheme: Optional[QuantizationArgs] = None format: str = DEFAULT_QUANTIZATION_FORMAT quantization_status: QuantizationStatus = QuantizationStatus.INITIALIZED global_compression_ratio: Optional[float] = None ignore: Optional[List[str]] = Field(default_factory=list) def model_post_init(self, __context): """ updates any quantization schemes defined as presets to be fully loaded schemes """ for group_name, targets_or_scheme in self.config_groups.items(): if isinstance(targets_or_scheme, QuantizationScheme): continue # scheme already defined self.config_groups[group_name] = preset_name_to_scheme( name=group_name, targets=targets_or_scheme, ) def to_dict(self): # for compatibility with HFQuantizer return self.model_dump() @staticmethod def from_pretrained( model: Module, format: Optional[str] = None ) -> Optional["QuantizationConfig"]: """ Converts a model into its associated QuantizationConfig based on the QuantizationScheme attached to each quantized module :param model: model to calculate quantization scheme of :return: filled out QuantizationScheme for the input model """ quant_scheme_to_layers = [] quantization_status = None ignore = {} quantization_type_names = set() for name, submodule in iter_named_quantizable_modules( model, include_children=True, include_attn=True ): layer_type = module_type(submodule) if not is_module_quantized(submodule): if layer_type not in ignore: ignore[layer_type] = [] ignore[layer_type].append(name) else: quantization_status = submodule.quantization_status scheme = submodule.quantization_scheme quantization_type_names.add(layer_type) match_found = False for existing_scheme in quant_scheme_to_layers: if scheme == existing_scheme: match_found = True break if not match_found: quant_scheme_to_layers.append(scheme) if len(quant_scheme_to_layers) == 0: # No quantized layers return None # kv-cache only, no weight/activation quantization if ( len(quantization_type_names) == 1 and "attention" in list(quantization_type_names)[0].lower() ): quantization_type_names.add("Linear") # clean up ignore list, we can leave out layers types if none of the # instances are quantized consolidated_ignore = [] for layer_type, ignore_names in ignore.items(): if layer_type in quantization_type_names: # specific layers of a quantized type are ignored consolidated_ignore += ignore_names # else we leave it off the ignore list, doesn't fall under any of the # existing quantization schemes so it won't be quantized kv_cache_args, quant_scheme_to_layers = parse_out_kv_cache_args( quant_scheme_to_layers ) kv_cache_scheme = ( kv_cache_args.model_dump() if kv_cache_args is not None else kv_cache_args ) config_groups = {} for idx, scheme in enumerate(quant_scheme_to_layers): group_name = "group_" + str(idx) 
config_groups[group_name] = scheme if format is None: if quantization_status == QuantizationStatus.COMPRESSED: format = CompressionFormat.int_quantized.value else: format = CompressionFormat.dense.value return QuantizationConfig( config_groups=config_groups, quantization_status=quantization_status, kv_cache_scheme=kv_cache_scheme, global_compression_ratio=None, format=format, ignore=consolidated_ignore, ) def requires_calibration_data(self): if self.kv_cache_scheme is not None: return True for _, scheme in self.config_groups.items(): if scheme.input_activations is not None: if not scheme.input_activations.dynamic: return True if scheme.output_activations is not None: if not scheme.output_activations.dynamic: return True return False compressed-tensors-0.9.4/src/compressed_tensors/quantization/quant_scheme.py000066400000000000000000000142641500222531600276520ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from copy import deepcopy from typing import Any, Dict, List, Optional from compressed_tensors.quantization.quant_args import ( QuantizationArgs, QuantizationStrategy, QuantizationType, ) from pydantic import BaseModel, model_validator __all__ = [ "QuantizationScheme", "preset_name_to_scheme", "is_preset_scheme", ] class QuantizationScheme(BaseModel): """ Set of QuantizationArgs defining how the weights, inputs and outputs of target list of modules should be quantized :param targets: list of modules to apply the QuantizationArgs to, can be layer names, layer types or a regular expression, typically ["Linear"] :param weights: quantization config for layer weights :param input_activations: quantization config for layer inputs :param output_activations: quantization config for layer outputs """ targets: List[str] weights: Optional[QuantizationArgs] = None input_activations: Optional[QuantizationArgs] = None output_activations: Optional[QuantizationArgs] = None @model_validator(mode="after") def validate_model_after(model: "QuantizationArgs") -> Dict[str, Any]: inputs = model.input_activations outputs = model.output_activations if inputs is not None: if inputs.actorder is not None: raise ValueError("Cannot apply actorder to input activations") if outputs is not None: if outputs.actorder is not None: raise ValueError("Cannot apply actorder to output activations") return model """ Pre-Set Quantization Scheme Args """ def preset_name_to_scheme(name: str, targets: List[str]) -> QuantizationScheme: """ :param name: preset quantization settings name. 
must exist in upper case in PRESET_SCHEMES :param targets: list of quantization targets to be passed to the Scheme :return: new QuantizationScheme for a given name with the given targets """ name = name.upper() if name not in PRESET_SCHEMES: raise KeyError( f"Unknown preset scheme name {name}, " f"available names: {list(PRESET_SCHEMES.keys())}" ) scheme_args = deepcopy(PRESET_SCHEMES[name]) # deepcopy to avoid args references return QuantizationScheme( targets=targets, **scheme_args, ) def is_preset_scheme(name: str) -> bool: """ :param name: preset quantization settings name :return: True if the name is a preset scheme name """ return name.upper() in PRESET_SCHEMES UNQUANTIZED = dict() # 8 bit integer weights and 8 bit activations quantization INT8_W8A8 = dict( weights=QuantizationArgs( num_bits=8, type=QuantizationType.INT, strategy=QuantizationStrategy.CHANNEL, symmetric=True, dynamic=False, ), input_activations=QuantizationArgs( num_bits=8, type=QuantizationType.INT, strategy=QuantizationStrategy.TOKEN, symmetric=True, dynamic=True, observer=None, ), ) # 8 bit integer weights only quantization W8A16 = dict( weights=QuantizationArgs( num_bits=8, type=QuantizationType.INT, strategy=QuantizationStrategy.CHANNEL, symmetric=True, dynamic=False, ), ) # 4 bit integer weights only quantization W4A16 = dict( weights=QuantizationArgs( num_bits=4, type=QuantizationType.INT, strategy=QuantizationStrategy.GROUP, group_size=128, symmetric=True, dynamic=False, ), ) # 4 bit integer weights only asymmetric quantization W4A16_ASYM = dict( weights=QuantizationArgs( num_bits=4, type=QuantizationType.INT, strategy=QuantizationStrategy.GROUP, group_size=128, symmetric=False, dynamic=False, ), ) # 4 bit integer weights and 8 bit activations quantization INT8_W4A8 = dict( weights=QuantizationArgs( num_bits=4, type=QuantizationType.INT, group_size=128, strategy=QuantizationStrategy.GROUP, symmetric=True, dynamic=False, ), input_activations=QuantizationArgs( num_bits=8, type=QuantizationType.INT, strategy=QuantizationStrategy.TOKEN, symmetric=True, dynamic=True, observer=None, ), ) # FP8 weights and FP8 activations quantization FP8 = dict( weights=QuantizationArgs( num_bits=8, type=QuantizationType.FLOAT, strategy=QuantizationStrategy.TENSOR, symmetric=True, dynamic=False, ), input_activations=QuantizationArgs( num_bits=8, type=QuantizationType.FLOAT, strategy=QuantizationStrategy.TENSOR, symmetric=True, dynamic=False, ), ) # FP8 weights and FP8 dynamic activations quantization FP8_DYNAMIC = dict( weights=QuantizationArgs( num_bits=8, type=QuantizationType.FLOAT, strategy=QuantizationStrategy.CHANNEL, symmetric=True, dynamic=False, ), input_activations=QuantizationArgs( num_bits=8, type=QuantizationType.FLOAT, strategy=QuantizationStrategy.TOKEN, symmetric=True, dynamic=True, observer=None, ), ) PRESET_SCHEMES = { # Unquantized (no-op) "UNQUANTIZED": UNQUANTIZED, # Integer weight only schemes "W8A16": W8A16, "W4A16": W4A16, "W4A16_ASYM": W4A16_ASYM, # Integer weight and activation schemes "W8A8": INT8_W8A8, "INT8": INT8_W8A8, # alias for W8A8 "W4A8": INT8_W4A8, # Float weight and activation schemes "FP8": FP8, "FP8_DYNAMIC": FP8_DYNAMIC, } compressed-tensors-0.9.4/src/compressed_tensors/quantization/utils/000077500000000000000000000000001500222531600257555ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/quantization/utils/__init__.py000066400000000000000000000012201500222531600300610ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .helpers import * compressed-tensors-0.9.4/src/compressed_tensors/quantization/utils/helpers.py000066400000000000000000000340671500222531600300030ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import logging from typing import Generator, List, Optional, Tuple import torch from compressed_tensors.quantization.quant_args import ( FP8_DTYPE, QuantizationArgs, QuantizationStrategy, QuantizationType, ) from compressed_tensors.quantization.quant_scheme import QuantizationScheme from torch import FloatTensor, IntTensor, Tensor from torch.nn import Module from tqdm import tqdm __all__ = [ "infer_quantization_status", "is_module_quantized", "is_model_quantized", "module_type", "calculate_compression_ratio", "get_torch_bit_depth", "can_quantize", "parse_out_kv_cache_args", "KV_CACHE_TARGETS", "is_kv_cache_quant_scheme", "iter_named_leaf_modules", "iter_named_quantizable_modules", "compute_dynamic_scales_and_zp", "calculate_range", "calculate_qparams", ] # target the self_attn layer # QuantizedKVParameterCache is responsible for obtaining the k_scale and v_scale KV_CACHE_TARGETS = ["re:.*self_attn$"] _LOGGER: logging.Logger = logging.getLogger(__name__) def calculate_qparams( min_vals: Tensor, max_vals: Tensor, quantization_args: QuantizationArgs ) -> Tuple[FloatTensor, IntTensor]: """ :param min_vals: tensor of min value(s) to calculate scale(s) and zero point(s) from :param max_vals: tensor of max value(s) to calculate scale(s) and zero point(s) from :param quantization_args: settings to quantization :return: tuple of the calculated scale(s) and zero point(s) """ # based on the implementations for consuming quantized values, # 0.0 must always be representable within the quantized range min_vals = torch.min(min_vals, torch.zeros_like(min_vals)) max_vals = torch.max(max_vals, torch.zeros_like(max_vals)) device = min_vals.device bit_min, bit_max = calculate_range(quantization_args, device) bit_range = bit_max - bit_min zp_dtype = quantization_args.pytorch_dtype() if quantization_args.symmetric: max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals)) scales = max_val_pos / (float(bit_range) / 2) scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps) zero_points = torch.zeros(scales.shape, device=device, dtype=min_vals.dtype) else: scales = (max_vals - min_vals) / float(bit_range) scales = torch.clamp(scales, min=torch.finfo(torch.float32).eps) 
zero_points = bit_min - (min_vals / scales) zero_points = torch.clamp(zero_points, bit_min, bit_max) # match zero-points to quantized type # if casting to int, use round instead of truncate if quantization_args.type == QuantizationType.INT: zero_points = torch.round(zero_points) zero_points = zero_points.to(zp_dtype) if scales.ndim == 0: scales = scales.reshape(1) zero_points = zero_points.reshape(1) return scales, zero_points def compute_dynamic_scales_and_zp(value: Tensor, args: QuantizationArgs): """ Returns the computed scales and zero points for dynamic activation quantization. :param value: tensor to calculate quantization parameters for :param args: quantization args :param reduce_dims: optional tuple of dimensions to reduce along, returned scale and zero point will be shaped (1,) along the reduced dimensions :return: tuple of scale and zero point derived from the observed tensor """ if args.strategy == QuantizationStrategy.TOKEN: dim = {1, 2} reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim) elif args.strategy == QuantizationStrategy.TENSOR: reduce_dims = None else: raise ValueError( f"One of {QuantizationStrategy.TOKEN} or {QuantizationStrategy.TENSOR} ", "must be used for dynamic quantization", ) if not reduce_dims: min_val, max_val = torch.aminmax(value) else: min_val = torch.amin(value, dim=reduce_dims, keepdims=True) max_val = torch.amax(value, dim=reduce_dims, keepdims=True) return calculate_qparams(min_val, max_val, args) def calculate_range(quantization_args: QuantizationArgs, device: str) -> Tuple: """ Calculated the effective quantization range for the given Quantization Args :param quantization_args: quantization args to get range of :param device: device to store the range to :return: tuple endpoints for the given quantization range """ if quantization_args.type == QuantizationType.INT: bit_range = 2**quantization_args.num_bits q_max = torch.tensor(bit_range / 2 - 1, device=device) q_min = torch.tensor(-bit_range / 2, device=device) elif quantization_args.type == QuantizationType.FLOAT: if quantization_args.num_bits != 8: raise ValueError( "Floating point quantization is only supported for 8 bits," f"got {quantization_args.num_bits}" ) fp_range_info = torch.finfo(FP8_DTYPE) q_max = torch.tensor(fp_range_info.max, device=device) q_min = torch.tensor(fp_range_info.min, device=device) else: raise ValueError(f"Invalid quantization type {quantization_args.type}") return q_min, q_max def infer_quantization_status(model: Module) -> Optional["QuantizationStatus"]: # noqa """ Checks the quantization status of a model. Assumes all modules in the model have the same status, so only the first quantized model is checked. 
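    Illustrative usage (a minimal sketch; assumes `model` is a torch module whose
    quantized submodules already carry a `quantization_status` attribute):

        status = infer_quantization_status(model)
        if status is None:
            ...  # no quantized modules were found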
:param model: model to check quantization status for :return: quantization status if the model is quantized, otherwise None """ for module in model.modules(): status = getattr(module, "quantization_status", None) if status is not None: return status return None def is_module_quantized(module: Module) -> bool: """ Check if a module is quantized, based on the existence of a non-empty quantization scheme :param module: pytorch module to check :return: True if module is quantized, False otherwise """ if not hasattr(module, "quantization_scheme"): return False if module.quantization_scheme.weights is not None: return True if module.quantization_scheme.input_activations is not None: return True if module.quantization_scheme.output_activations is not None: return True return False def is_model_quantized(model: Module) -> bool: """ Check if any modules in a model are quantized, based on the existence of a non-empty quantization scheme in at least one module :param model: pytorch model :return: True if model is quantized, False otherwise """ for _, submodule in iter_named_leaf_modules(model): if is_module_quantized(submodule): return True return False def module_type(module: Module) -> str: """ Gets a string representation of a module type :module: pytorch module to get type of :return: module type as a string """ return type(module).__name__ def iter_named_leaf_modules(model: Module) -> Generator[Tuple[str, Module], None, None]: """ Yields modules that do not have any submodules except observers. The observers themselves are not yielded :param model: model to get leaf modules of :returns: generator tuple of (name, leaf_submodule) """ for name, submodule in model.named_modules(): children = list(submodule.children()) # TODO: verify if an observer would ever be attached in this case/remove check if len(children) == 0 and "observer" in name: yield name, submodule else: if len(children) > 0: named_children, children = zip(*list(submodule.named_children())) has_non_observer_children = False for i in range(len(children)): child_name = named_children[i] if "observer" not in child_name: has_non_observer_children = True if not has_non_observer_children: yield name, submodule def iter_named_quantizable_modules( model: Module, include_children: bool = True, include_attn: bool = False ) -> Generator[Tuple[str, Module], None, None]: """ Yield name and submodule of - leaf modules, set by include_children - attention modyles, set by include_attn :param model: model to get leaf modules of :param include_children: flag to get the leaf modules :param inlcude_attn: flag to get the attention modules :returns: generator tuple of (name, submodule) """ for name, submodule in model.named_modules(): # TODO: verify if an observer would ever be attached in this case/remove check if include_children: children = list(submodule.children()) if len(children) == 0 and "observer" not in name: yield name, submodule else: if len(children) > 0: named_children, children = zip(*list(submodule.named_children())) has_non_observer_children = False for i in range(len(children)): child_name = named_children[i] if "observer" not in child_name: has_non_observer_children = True if not has_non_observer_children: yield name, submodule if include_attn: if name.endswith("self_attn"): yield name, submodule def get_torch_bit_depth(value: torch.Tensor) -> int: """ Determine the number of bits used to represent the dtype of a tensor :param value: tensor to check bit depth of :return: bit depth of each element in the value tensor """ try: bit_depth = 
torch.finfo(value.dtype).bits except TypeError: bit_depth = torch.iinfo(value.dtype).bits return bit_depth def can_quantize(value: torch.Tensor, quant_args: "QuantizationArgs") -> bool: # noqa """ Checks if value can be quantized by quant_args. :param value: tensor to check for quantization :param quant_args: QuantizationArgs to use for quantization :return: False if value is already quantized to quant_args or value is incompatible with quant_args, True if value can be quantized with quant_args """ bit_depth = get_torch_bit_depth(value) requested_depth = quant_args.num_bits if bit_depth < quant_args.num_bits: _LOGGER.warn( f"Can't quantize tensor with bit depth {bit_depth} to {requested_depth}." "The QuantizationArgs provided are not compatible with the input tensor." ) return bit_depth > quant_args.num_bits def calculate_compression_ratio(model: Module) -> float: """ Calculates the quantization compression ratio of a pytorch model, based on the number of bits needed to represent the total weights in compressed form. Does not take into account activation quantizatons. :param model: pytorch module to calculate compression ratio for :return: compression ratio of the whole model """ total_compressed = 0.0 total_uncompressed = 0.0 for name, submodule in tqdm( iter_named_leaf_modules(model), desc="Calculating quantization compression ratio", ): for parameter in model.parameters(): uncompressed_bits = get_torch_bit_depth(parameter) compressed_bits = uncompressed_bits if is_module_quantized(submodule) and submodule.quantization_scheme.weights: compressed_bits = submodule.quantization_scheme.weights.num_bits num_weights = parameter.numel() total_compressed += compressed_bits * num_weights total_uncompressed += uncompressed_bits * num_weights return total_uncompressed / total_compressed def is_kv_cache_quant_scheme(scheme: QuantizationScheme) -> bool: """ Check whether the QuantizationScheme targets the kv cache. It does if all the following criteria are met: - the scheme targets either exactly match the KV_CACHE_TARGETS or the match KV_CACHE_TARGETS regex pattern - the scheme quantizes output_activations (we want to quantize the outputs from the KV_CACHE_TARGETS, as their correspond to the keys and values that are to be saved in the cache) :param scheme: The QuantizationScheme to investigate :return: boolean flag """ for target in scheme.targets: if target in KV_CACHE_TARGETS: return True return False def parse_out_kv_cache_args( quant_scheme_to_layers: List[QuantizationScheme], ) -> Tuple[Optional[QuantizationArgs], List[QuantizationScheme]]: """ If possible, parse out the kv cache specific QuantizationArgs from the list of the QuantizationSchemes. 
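    A scheme is treated as kv cache specific when `is_kv_cache_quant_scheme`
    returns True for it, i.e. its targets match `KV_CACHE_TARGETS`.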
If no kv cache specific QuantizationArgs available, this function acts as an identity function :param quant_scheme_to_layers: list of QuantizationSchemes :return: kv_cache_args (optional) and the (remaining or original) list of the QuantizationSchemes """ kv_cache_quant_scheme_to_layers = [ scheme for scheme in quant_scheme_to_layers if is_kv_cache_quant_scheme(scheme) ] quant_scheme_to_layers = [ scheme for scheme in quant_scheme_to_layers if not is_kv_cache_quant_scheme(scheme) ] if kv_cache_quant_scheme_to_layers: kv_cache_quant_scheme_to_layers = kv_cache_quant_scheme_to_layers[0] kv_cache_args = kv_cache_quant_scheme_to_layers.output_activations else: kv_cache_args = None return kv_cache_args, quant_scheme_to_layers compressed-tensors-0.9.4/src/compressed_tensors/registry/000077500000000000000000000000001500222531600237375ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/registry/__init__.py000066400000000000000000000012221500222531600260450ustar00rootroot00000000000000# flake8: noqa # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .registry import * compressed-tensors-0.9.4/src/compressed_tensors/registry/registry.py000066400000000000000000000271701500222531600261700ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Universal registry to support registration and loading of child classes and plugins of neuralmagic utilities """ import importlib from collections import defaultdict from typing import Any, Dict, List, Optional, Type, Union __all__ = [ "RegistryMixin", "register", "get_from_registry", "registered_names", "registered_aliases", "standardize_lookup_name", ] _ALIAS_REGISTRY: Dict[Type, Dict[str, str]] = defaultdict(dict) _REGISTRY: Dict[Type, Dict[str, Any]] = defaultdict(dict) def standardize_lookup_name(name: str) -> str: """ Standardize the given name for lookup in the registry. This will replace all underscores and spaces with hyphens and convert the name to lowercase. 
example: ``` standardize_lookup_name("Foo_bar baz") == "foo-bar-baz" ``` :param name: name to standardize :return: standardized name """ return name.replace("_", "-").replace(" ", "-").lower() def standardize_alias_name( name: Union[None, str, List[str]] ) -> Union[None, str, List[str]]: if name is None: return None elif isinstance(name, str): return standardize_lookup_name(name) else: # isinstance(name, list) return [standardize_lookup_name(n) for n in name] class RegistryMixin: """ Universal registry to support registration and loading of child classes and plugins of neuralmagic utilities. Classes that require a registry or plugins may add the `RegistryMixin` and use `register` and `load` as the main entrypoints for adding new implementations and loading requested values from its registry. If a class should only have its child classes in its registry, the class should set the static attribute `registry_requires_subclass` to True example ```python class Dataset(RegistryMixin): pass # register with default name @Dataset.register() class ImageNetDataset(Dataset): pass # load as "ImageNetDataset" imagenet = Dataset.load("ImageNetDataset") # register with custom name @Dataset.register(name="cifar-dataset") class Cifar(Dataset): pass Note: the name will be standardized for lookup in the registry. For example, if a class is registered as "cifar_dataset" or "cifar dataset", it will be stored as "cifar-dataset". The user will be able to load the class with any of the three name variants. # register with multiple aliases @Dataset.register(alias=["cifar-10-dataset", "cifar_100_dataset"]) class Cifar(Dataset): pass # load as "cifar-dataset" cifar = Dataset.load_from_registry("cifar-dataset") # load from custom file that implements a dataset mnist = Dataset.load_from_registry("/path/to/mnnist_dataset.py:MnistDataset") ``` """ # set to True in child class to add check that registered/retrieved values # implement the class it is registered to registry_requires_subclass: bool = False @classmethod def register( cls, name: Optional[str] = None, alias: Union[List[str], str, None] = None ): """ Decorator for registering a value (ie class or function) wrapped by this decorator to the base class (class that .register is called from) :param name: name or list of names to register the wrapped value as, defaults to value.__name__ :param alias: alias or list of aliases to register the wrapped value as, defaults to None :return: register decorator """ def decorator(value: Any): cls.register_value(value, name=name, alias=alias) return value return decorator @classmethod def register_value( cls, value: Any, name: str, alias: Union[str, List[str], None] = None ): """ Registers the given value to the class `.register_value` is called from :param value: value to register :param name: name to register the wrapped value as, defaults to value.__name__ :param alias: alias or list of aliases to register the wrapped value as, defaults to None """ register( parent_class=cls, value=value, name=name, alias=alias, require_subclass=cls.registry_requires_subclass, ) @classmethod def load_from_registry(cls, name: str, **constructor_kwargs) -> object: """ :param name: name of registered class to load :param constructor_kwargs: arguments to pass to the constructor retrieved from the registry :return: loaded object registered to this class under the given name, constructed with the given kwargs. 
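            For example, with the `Dataset` registry sketched in the class docstring
            above, `Dataset.load_from_registry("cifar-dataset", root="/tmp")` would
            construct the registered `Cifar` class with `root="/tmp"` (the `root`
            keyword is purely illustrative).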
Raises error if the name is not found in the registry """ constructor = cls.get_value_from_registry(name=name) return constructor(**constructor_kwargs) @classmethod def get_value_from_registry(cls, name: str): """ :param name: name to retrieve from the registry :return: value from retrieved the registry for the given name, raises error if not found """ return get_from_registry( parent_class=cls, name=name, require_subclass=cls.registry_requires_subclass, ) @classmethod def registered_names(cls) -> List[str]: """ :return: list of all names registered to this class """ return registered_names(cls) @classmethod def registered_aliases(cls) -> List[str]: """ :return: list of all aliases registered to this class """ return registered_aliases(cls) def register( parent_class: Type, value: Any, name: Optional[str] = None, alias: Union[List[str], str, None] = None, require_subclass: bool = False, ): """ :param parent_class: class to register the name under :param value: the value to register :param name: name to register the wrapped value as, defaults to value.__name__ :param alias: alias or list of aliases to register the wrapped value as, defaults to None :param require_subclass: require that value is a subclass of the class this method is called from """ if name is None: # default name name = value.__name__ name = standardize_lookup_name(name) alias = standardize_alias_name(alias) register_alias(name=name, alias=alias, parent_class=parent_class) if require_subclass: _validate_subclass(parent_class, value) if name in _REGISTRY[parent_class]: # name already exists - raise error if two different values are attempting # to share the same name registered_value = _REGISTRY[parent_class][name] if registered_value is not value: raise RuntimeError( f"Attempting to register name {name} as {value} " f"however {name} has already been registered as {registered_value}" ) else: _REGISTRY[parent_class][name] = value def get_from_registry( parent_class: Type, name: str, require_subclass: bool = False ) -> Any: """ :param parent_class: class that the name is registered under :param name: name to retrieve from the registry of the class :param require_subclass: require that value is a subclass of the class this method is called from :return: value from retrieved the registry for the given name, raises error if not found """ name = standardize_lookup_name(name) if ":" in name: # user specifying specific module to load and value to import module_path, value_name = name.split(":") retrieved_value = _import_and_get_value_from_module(module_path, value_name) else: # look up name in alias registry name = _ALIAS_REGISTRY[parent_class].get(name, name) # look up name in registry retrieved_value = _REGISTRY[parent_class].get(name) if retrieved_value is None: raise KeyError( f"Unable to find {name} registered under type {parent_class}.\n" f"Registered values for {parent_class}: " f"{registered_names(parent_class)}\n" f"Registered aliases for {parent_class}: " f"{registered_aliases(parent_class)}" ) if require_subclass: _validate_subclass(parent_class, retrieved_value) return retrieved_value def registered_names(parent_class: Type) -> List[str]: """ :param parent_class: class to look up the registry of :return: all names registered to the given class """ return list(_REGISTRY[parent_class].keys()) def registered_aliases(parent_class: Type) -> List[str]: """ :param parent_class: class to look up the registry of :return: all aliases registered to the given class """ registered_aliases_plus_names = 
list(_ALIAS_REGISTRY[parent_class].keys()) registered_aliases = list( set(registered_aliases_plus_names) - set(registered_names(parent_class)) ) return registered_aliases def register_alias( name: str, parent_class: Type, alias: Union[str, List[str], None] = None ): """ Updates the mapping from the alias(es) to the given name. If the alias is None, the name is used as the alias. ``` :param name: name that the alias refers to :param parent_class: class that the name is registered under :param alias: single alias or list of aliases that refer to the name, defaults to None """ if alias is not None: alias = alias if isinstance(alias, list) else [alias] else: alias = [] if name in alias: raise KeyError( f"Attempting to register alias {name}, " f"that is identical to the standardized name: {name}." ) alias.append(name) for alias_name in alias: if alias_name in _ALIAS_REGISTRY[parent_class]: raise KeyError( f"Attempting to register alias {alias_name} as {name} " f"however {alias_name} has already been registered as " f"{_ALIAS_REGISTRY[alias_name]}" ) _ALIAS_REGISTRY[parent_class][alias_name] = name def _import_and_get_value_from_module(module_path: str, value_name: str) -> Any: # import the given module path and try to get the value_name if it is included # in the module # load module spec = importlib.util.spec_from_file_location( f"plugin_module_for_{value_name}", module_path ) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) # get value from module value = getattr(module, value_name, None) if not value: raise RuntimeError( f"Unable to find attribute {value_name} in module {module_path}" ) return value def _validate_subclass(parent_class: Type, child_class: Type): if not issubclass(child_class, parent_class): raise ValueError( f"class {child_class} is not a subclass of the class it is " f"registered for: {parent_class}." ) compressed-tensors-0.9.4/src/compressed_tensors/utils/000077500000000000000000000000001500222531600232275ustar00rootroot00000000000000compressed-tensors-0.9.4/src/compressed_tensors/utils/__init__.py000066400000000000000000000014501500222531600253400ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # flake8: noqa from .helpers import * from .offload import * from .permutations_24 import * from .permute import * from .safetensors_load import * from .semi_structured_conversions import * compressed-tensors-0.9.4/src/compressed_tensors/utils/helpers.py000066400000000000000000000242571500222531600252550ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import warnings from functools import wraps from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional import numpy import torch from transformers import AutoConfig if TYPE_CHECKING: from compressed_tensors.compressors import ModelCompressor __all__ = [ "infer_compressor_from_model_config", "fix_fsdp_module_name", "tensor_follows_mask_structure", "replace_module", "is_compressed_tensors_config", "getattr_chain", "deprecated", "Aliasable", "combine_shards", "shard_tensor", "pack_bitmasks", "unpack_bitmasks", ] FSDP_WRAPPER_NAME = "_fsdp_wrapped_module" def infer_compressor_from_model_config( pretrained_model_name_or_path: str, ) -> Optional["ModelCompressor"]: # noqa: F821 """ Given a path to a model config, extract a sparsity config if it exists and return the associated ModelCompressor :param pretrained_model_name_or_path: path to model config on disk or HF hub :return: matching compressor if config contains a sparsity config """ from compressed_tensors.compressors import ModelCompressor from compressed_tensors.config import CompressionConfig config = AutoConfig.from_pretrained(pretrained_model_name_or_path) sparsity_config = ModelCompressor.parse_sparsity_config(config) if sparsity_config is None: return None format = sparsity_config.get("format") sparsity_config = CompressionConfig.load_from_registry(format, **sparsity_config) compressor = ModelCompressor.load_from_registry(format, config=sparsity_config) return compressor # TODO: There is already the same function in # SparseML, should be moved to a shared location # in the future def fix_fsdp_module_name(name: str) -> str: """ Remove FSDP wrapper prefixes from a module name Accounts for scenario where FSDP_WRAPPER_NAME is at the end of the name, as well as in the middle. :param name: name to strip :return: stripped name """ return name.replace(FSDP_WRAPPER_NAME + ".", "").replace( "." + FSDP_WRAPPER_NAME, "" ) def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool: """ :param tensor: tensor to check :param mask: mask structure to check for, in the format "n:m" :return: True if the tensor follows the mask structure, False otherwise. Note, some weights can incidentally be zero, so we check for atleast n zeros in each chunk of size m """ n, m = tuple(map(int, mask.split(":"))) # Reshape the tensor into chunks of size m tensor = tensor.view(-1, m) # Count the number of zeros in each chunk zero_counts = (tensor == 0).sum(dim=1) # Check if the number of zeros in each chunk atleast n # Greater than sign is needed as some weights can incidentally # be zero if not torch.all(zero_counts >= n).item(): raise ValueError() return True def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module): if "." 
in name: parent_name = name.rsplit(".", 1)[0] child_name = name[len(parent_name) + 1 :] parent = model.get_submodule(parent_name) else: parent_name = "" parent = model child_name = name setattr(parent, child_name, new_module) def is_compressed_tensors_config(compression_config: Any) -> bool: """ Returns True if CompressedTensorsConfig is available from transformers and compression_config is an instance of CompressedTensorsConfig See: https://github.com/huggingface/transformers/pull/31704 """ try: from transformers.utils.quantization_config import CompressedTensorsConfig return isinstance(compression_config, CompressedTensorsConfig) except ImportError: return False def getattr_chain(obj: Any, chain_str: str, *args, **kwargs) -> Any: """ Chain multiple getattr calls, separated by `.` :param obj: base object whose attributes are being retrieved :param chain_str: attribute names separated by `.` :param default: default value, throw error otherwise """ if len(args) >= 1: has_default = True default = args[0] elif "default" in kwargs: has_default = True default = kwargs["default"] else: has_default = False attr_names = chain_str.split(".") res = obj for attr_name in attr_names: if not hasattr(res, attr_name): if has_default: return default else: raise AttributeError(f"{res} object has no attribute {attr_name}") res = getattr(res, attr_name) return res def deprecated(future_name: Optional[str] = None, message: Optional[str] = None): """ Decorator to mark functions as deprecated :param new_function: Function called in place of deprecated function :param message: Deprecation message, replaces default deprecation message """ def decorator(func: Callable[[Any], Any]): nonlocal message if message is None: message = ( f"{func.__name__} is deprecated and will be removed in a future release" ) if future_name is not None: message += f". Please use {future_name} instead." @wraps(func) def wrapped(*args, **kwargs): warnings.warn(message, DeprecationWarning, stacklevel=2) return func(*args, **kwargs) return wrapped return decorator class Aliasable: """ A mixin for enums to allow aliasing of enum members Example: >>> class MyClass(Aliasable, int, Enum): >>> ... """ @staticmethod def get_aliases() -> Dict[str, str]: raise NotImplementedError() def __eq__(self, other): if isinstance(other, self.__class__): aliases = self.get_aliases() return self.value == other.value or ( aliases.get(self.value, self.value) == aliases.get(other.value, other.value) ) else: aliases = self.get_aliases() self_value = aliases.get(self.value, self.value) other_value = aliases.get(other, other) return self_value == other_value def __hash__(self): canonical_value = self.aliases.get(self.value, self.value) return hash(canonical_value) def shard_tensor( tensor: torch.Tensor, shard_sizes: List[int], dim: int = 0 ) -> List[torch.Tensor]: """ Shards a tensor into a list of tensors along a given dimension. raises: ValueError: If the sum of shard_sizes does not match the size of the tensor along the given dimension. :param tensor: The input tensor to shard. :param shard_sizes : List of sizes for each shard along the specified dimension. :param dim : The dimension along which to shard the tensor. :returns: A list of tensors sharded along the specified dimension. """ if sum(shard_sizes) != tensor.size(dim): raise ValueError( "Sum of shard_sizes must equal the size of the tensor " "along the specified dimension." 
) shards = [] start_idx = 0 for size in shard_sizes: end_idx = start_idx + size shard = tensor.narrow(dim, start_idx, size) shards.append(shard) start_idx = end_idx return shards def combine_shards(shards, dim=0): """ Combine decompressed shards along a given dimension using `narrow`. :param shards: List of decompressed shard tensors. :param dim: Dimension to combine along (default: 0). :return: Combined decompressed tensor. """ if not shards: raise ValueError("The list of shards is empty.") # Assert that all shards have the same dtype shard_dtypes = {shard.dtype for shard in shards} if len(shard_dtypes) > 1: raise ValueError("All shards must have the same dtype.") # Determine the total shape of the combined tensor total_shape = list(shards[0].shape) total_shape[dim] = sum(shard.shape[dim] for shard in shards) # Create the combined tensor combined = torch.zeros(total_shape, dtype=shards[0].dtype, device=shards[0].device) # Fill the combined tensor using narrow shard_offset = 0 for shard in shards: shard_size = shard.shape[dim] combined.narrow(dim, shard_offset, shard_size).copy_(shard) shard_offset += shard_size return combined def pack_bitmasks(bytemasks: torch.Tensor) -> torch.Tensor: """ Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be compressed to R x ceil(C/8) :param bytemasks: mask tensor where each byte corresponds to a weight :return: mask tensor where each bit corresounds to a weight """ packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little") packed_bits_torch = torch.from_numpy(packed_bits_numpy) return packed_bits_torch def unpack_bitmasks( packed_bitmasks: torch.Tensor, original_shape: List[int] ) -> torch.Tensor: """ Converts a bitmask tensor back to a bytemask tensor for use during decompression :param packed_bitmasks: mask tensor where each bit corresponds to a weight :param original_shape: dense shape to decompress to :return: boolean mask of weights in the original dense shape """ # Unpack the bits unpacked_bits = numpy.unpackbits( packed_bitmasks.cpu().numpy(), axis=-1, count=original_shape[-1], bitorder="little", ) # Reshape to match the original shape unpacked_bitmasks_torch = torch.from_numpy( unpacked_bits.reshape(original_shape).astype(bool) ) return unpacked_bitmasks_torch compressed-tensors-0.9.4/src/compressed_tensors/utils/offload.py000066400000000000000000000335111500222531600252160ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities associated with offloading functionality provided by `accelerate`. 
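The table below summarizes the offloading-aware equivalents of the usual parameter
operations. A minimal usage sketch (`module` is any torch module that may or may not be
offloaded; `scale_data` and `new_scale_data` are placeholder tensors):

    register_offload_parameter(module, "scale", torch.nn.Parameter(scale_data))
    update_offload_parameter(module, "scale", new_scale_data)
    with align_module_device(module):
        ...  # parameters are temporarily onloaded to the execution device
    delete_offload_parameter(module, "scale")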
| ----------------------------------------------------------------------------------------------------- | # noqa: E501 | Operation | Without offloading support | With offloading support | # noqa: E501 | --------- | -------------------------------------- | ------------------------------------------------ | # noqa: E501 | Add | module.register_parameter(name, param) | register_offload_parameter(module, name, param) | # noqa: E501 | Check | N/A | has_offloaded_params(module) | # noqa: E501 | Onload | N/A | with align_module_device(module) | # noqa: E501 | Update | module.name.data.copy_(new_data) | update_offload_parameter(module, name, new_data) | # noqa: E501 | Delete | del module.name | delete_offload_parameter(module, name) | # noqa: E501 | ----------------------------------------------------------------------------------------------------- | # noqa: E501 """ import contextlib import warnings from functools import wraps from typing import Any, Callable, Dict, Literal, Optional, Union import torch try: from accelerate.hooks import ( AlignDevicesHook, add_hook_to_module, remove_hook_from_module, ) from accelerate.utils import ( OffloadedWeightsLoader, PrefixedDataset, set_module_tensor_to_device, ) _has_accelerate = True except ImportError: _has_accelerate = False AlignDevicesHook = None add_hook_to_module = None remove_hook_from_module = None OffloadedWeightsLoader = None PrefixedDataset = None set_module_tensor_to_device = None __all__ = [ "is_module_offloaded", "get_execution_device", "get_offloaded_device", "update_prefix_dict", "update_parameter_data", "register_offload_parameter", "update_offload_parameter", "delete_offload_parameter", "has_offloaded_params", "disable_hf_hook", "align_module_device", ] def check_accelerate(fallback: Any): def decorator(func: Callable[[Any], Any]): if not _has_accelerate: @wraps(func) def fallback_fn(*args, **kwargs): return fallback return fallback_fn return func return decorator """ Candidates for Depreciation """ @check_accelerate(fallback=False) def is_module_offloaded(module: torch.nn.Module) -> bool: return has_offloaded_params(module) def get_offloaded_device(module: torch.nn.Module) -> torch.device: """ :param module: module to check :return: device module is offloaded to onto after forward pass """ if has_offloaded_params(module): first_key = list(module._hf_hook.weights_map.keys())[0] prefix_dataset = module._hf_hook.weights_map.dataset return prefix_dataset[first_key].device return next(module.parameters()).device @check_accelerate(fallback=None) def update_prefix_dict(module: torch.nn.Module, key: str, data: torch.Tensor): """ Updates the offloaded state dict for a given module. Parameter named key is replaced by data. This is neccesary because parameter updates for offloaded modules do not persist automatically between loads. This function only affects the offloaded state dict and not the current state of the loaded module. :param module: module containing the parameter to update :param key: name of parameter to update :param data: tensor to update parameter with in the offloaded state dict """ if not has_offloaded_params(module): raise ValueError("Prefix dict is only applicable to offloaded modules") weights_map = module._hf_hook.weights_map offload_to_weights_map(weights_map, key, data) def update_parameter_data( module: torch.nn.Module, new_param_data: torch.Tensor, param_name: str ): """ Update the data of an existing parameter and its offload dict. 
Supports both parameters of offloaded modules and non-offloaded modules :param module: module containing the parameter to update :param new_param_data: tensor to update parameter with :param param_name: name of module parameter to update """ update_offload_parameter(module, param_name, new_param_data) """ Candidates for Upstreaming """ def get_execution_device(module: torch.nn.Module) -> torch.device: """ Get the device which inputs should be moved to before module execution :param module: module to check, may be offloaded :return: onload device of module """ if has_offloaded_params(module): return module._hf_hook.execution_device first_param = next(module.parameters(), None) if first_param is None: warnings.warn( f"Unable able to infer execution device of {module}, falling back to CPU" ) return torch.device("cpu") return first_param.device def register_offload_parameter( module: torch.nn.Module, name: str, parameter: torch.nn.Parameter, offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, ): """ Register a parameter to the given module which may be offloaded :param module: maybe offloaded module :param name: name of newly registered parameter :param parameter: parameter being registered :param offload_device: device on which weight will be offloaded to. If None is provided, then infer device from parameters on module """ has_onload = any(p.device != torch.device("meta") for p in module.parameters()) module.register_parameter(name, parameter) if has_offloaded_params(module): weights_map = module._hf_hook.weights_map offload_to_weights_map(weights_map, name, parameter.data, offload_device) if not has_onload: set_module_tensor_to_device(module, name, "meta") def update_offload_parameter( module: torch.nn.Module, name: str, data: Optional[torch.Tensor], offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, ): """ Update the data of an existing parameter and its offload dict. Supports both parameters of offloaded modules and non-offloaded modules :param module: module containing the parameter to update :param name: name of module parameter to update :param data: tensor to update parameter with :param offload_device: device on which weight will be offloaded to. 
If None is provided, then infer device from parameters on module """ param = getattr(module, name) if param.data.shape != data.shape: warnings.warn( f"Shape of parameter being updated {param.data.shape} does not match shape " f"of update data {data.shape}" ) # copy data into onloaded parameter if applicable if param.device != torch.device("meta"): param.data.copy_(data) # update offload dict if has_offloaded_params(module): weights_map = module._hf_hook.weights_map offload_to_weights_map(weights_map, name, data, offload_device) def delete_offload_parameter(module: torch.nn.Module, name: str): """ Delete a parameter from a module which may be offloaded :param module: maybe offloaded module :param name: name of parameter being deleted """ delattr(module, name) if has_offloaded_params(module): weights_map = module._hf_hook.weights_map delete_from_weights_map(weights_map, name) @check_accelerate(fallback=contextlib.nullcontext()) @contextlib.contextmanager def disable_hf_hook(module: torch.nn.Module): hooks = {} def collect_hooks(module): nonlocal hooks if hasattr(module, "_hf_hook"): hooks[module] = module._hf_hook remove_hook_from_module(module) module.apply(collect_hooks) yield for submodule, hook in hooks.items(): add_hook_to_module(submodule, hook) @check_accelerate(fallback=None) def offload_to_weights_map( weights_map: Union[PrefixedDataset, Dict, OffloadedWeightsLoader], key: str, value: torch.Tensor, offload_device: Optional[Union[torch.device, Literal["disk"]]] = None, ): """ Helper function which implements offloaded item assignment for PrefixedDataset, OffloadedWeightsLoader, and Dict types. :param weights_map: weight map to be updated with offload information :param key: key used to identify weight location :param value: weight being offloaded :param offload_device: device on which weight will be offloaded to. 
If None is provided, then infer device from parameters in weights_map """ if isinstance(weights_map, PrefixedDataset): if offload_device == "disk": raise ValueError(f"Cannot offload to disk with type {type(weights_map)}") dataset = weights_map.dataset key = f"{weights_map.prefix}{key}" offload_to_weights_map(dataset, key, value, offload_device) elif isinstance(weights_map, OffloadedWeightsLoader): if key not in weights_map.all_keys: weights_map.all_keys.append(key) if len(weights_map.index) <= 0 and offload_device != "disk": offload_to_weights_map(weights_map.state_dict, key, value, offload_device) else: raise NotImplementedError( "Updating weights_map with disk offloading is not implemented yet" ) elif isinstance(weights_map, dict): if offload_device == "disk": raise ValueError(f"Cannot offload to disk with type {type(weights_map)}") # infer offload device if offload_device is None: if key in weights_map: offload_device = weights_map[key].device else: tens = next(iter(weights_map.values()), None) if tens is None: raise ValueError( "Cannot infer offload device from empty weights_map" ) offload_device = tens.device weights_map[key] = value.to(device=offload_device) else: raise NotImplementedError( "Updating offload data not implemented for weights_map of type " f"{type(weights_map)}" ) @check_accelerate(fallback=None) def delete_from_weights_map( weights_map: Union[PrefixedDataset, Dict, OffloadedWeightsLoader], key: str, ): if isinstance(weights_map, PrefixedDataset): dataset = weights_map.dataset key = f"{weights_map.prefix}{key}" delete_from_weights_map(dataset, key) elif isinstance(weights_map, OffloadedWeightsLoader): if len(weights_map.index) <= 0: delete_from_weights_map(weights_map.state_dict, key) else: raise NotImplementedError( "Delete from weights_map with disk offloading is not implemented yet" ) elif isinstance(weights_map, dict): del weights_map[key] else: raise NotImplementedError( "Updating offload data not implemented for weights_map of type " f"{type(weights_map)}" ) """ Upstreamed Functions """ # introduced in accelerate v1.1.0 @check_accelerate(fallback=False) def has_offloaded_params(module: torch.nn.Module) -> bool: """ Checks if a module has offloaded parameters by checking if the given module has a AlignDevicesHook attached with offloading enabled Args: module (`torch.nn.Module`): The module to check for an offload hook. Returns: bool: `True` if the module has an offload hook and offloading is enabled, `False` otherwise. """ return ( hasattr(module, "_hf_hook") and isinstance(module._hf_hook, AlignDevicesHook) and module._hf_hook.offload ) # introduced in accelerate v1.1.0 @check_accelerate(fallback=contextlib.nullcontext()) @contextlib.contextmanager def align_module_device( module: torch.nn.Module, execution_device: Optional[torch.device] = None ): """ Context manager that moves a module's parameters to the specified execution device. Args: module (`torch.nn.Module`): Module with parameters to align. execution_device (`torch.device`, *optional*): If provided, overrides the module's execution device within the context. 
Otherwise, use hook execution device or pass """ if has_offloaded_params(module): if execution_device is not None: original_device = module._hf_hook.execution_device module._hf_hook.execution_device = execution_device try: module._hf_hook.pre_forward(module) yield finally: module._hf_hook.post_forward(module, None) if execution_device is not None: module._hf_hook.execution_device = original_device elif execution_device is not None: devices = { name: param.device for name, param in module.named_parameters(recurse=False) } try: for name in devices: set_module_tensor_to_device(module, name, execution_device) yield finally: for name, device in devices.items(): set_module_tensor_to_device(module, name, device) else: yield compressed-tensors-0.9.4/src/compressed_tensors/utils/permutations_24.py000066400000000000000000000047421500222531600266470ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import numpy import torch __all__ = ["get_permutations_24"] # Precompute permutations for Marlin24 weight and scale shuffling # Originally implemented in nm-vllm/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py # noqa: E501 # # Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight # data so that it is compatible with the tensor-core format that is described here: # https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501 # # As a result of this reordering, the vector loads inside the kernel will get the data # as it is needed for tensor-core (without the need to use ldmatrix instructions) def get_permutations_24(num_bits): perm_list = [] for i in range(32): perm1 = [] col = i // 4 col_o = col // 2 for block in [0, 1]: for row in [ 2 * (i % 4), 2 * (i % 4) + 1, 2 * (i % 4 + 4), 2 * (i % 4 + 4) + 1, ]: perm1.append(16 * row + col_o * 256 + 8 * (col % 2) + 4 * block) for j in range(4): perm_list.extend([p + 1 * j for p in perm1]) perm = numpy.array(perm_list) if num_bits == 4: interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) elif num_bits == 8: interleave = numpy.array([0, 2, 1, 3]) else: raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits)) perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) scale_perm = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) scale_perm_single = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return perm, scale_perm, scale_perm_single compressed-tensors-0.9.4/src/compressed_tensors/utils/permute.py000066400000000000000000000044631500222531600252710ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import Set, Tuple import torch __all__ = ["safe_permute"] # these datatypes are missing implementations required for standard permutation _EXPERIMENTAL_DTYPES: Set[Tuple[torch.dtype, torch.device]] = set() def safe_permute(value: torch.Tensor, perm: torch.Tensor, dim: int = 0) -> torch.Tensor: """ Perform out-of-place permutation without using torch.Tensor.index_put_, whose implementation is missing for datatypes such as `torch.float8_e4m3fn` :param value: tensor to permute :param perm: permutation map :param dim: dimension along which to apply permutation :return: permuted value """ dtype_tuple = (value.dtype, value.device) if dtype_tuple in _EXPERIMENTAL_DTYPES: return _fallback_permute(value, perm, dim) try: return value[tuple([slice(None)] * dim + [perm])] except RuntimeError: # Mark dtype as experimental if advanced indexing fails _EXPERIMENTAL_DTYPES.add(dtype_tuple) return _fallback_permute(value, perm, dim) def _fallback_permute( value: torch.Tensor, perm: torch.Tensor, dim: int ) -> torch.Tensor: """ Fallback permutation method for experimental dtypes. :param value: tensor to permute :param perm: permutation map :param dim: dimension along which to apply permutation :return: permuted value """ value_ret = value.clone() # cannot use zeros_like b/c of missing impl. orig_slices = [slice(None)] * (dim + 1) perm_slices = [slice(None)] * (dim + 1) for index, perm_index in enumerate(perm): orig_slices[dim] = index perm_slices[dim] = perm_index value_ret[tuple(orig_slices)] = value[tuple(perm_slices)] return value_ret compressed-tensors-0.9.4/src/compressed_tensors/utils/safetensors_load.py000066400000000000000000000263531500222531600271450ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import json import os import re import struct from typing import Dict, Iterable, Optional, Tuple, Union from safetensors import safe_open from torch import Tensor from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file __all__ = [ "get_safetensors_folder", "get_safetensors_header", "match_param_name", "merge_names", "get_weight_mappings", "get_nested_weight_mappings", "get_nested_mappings_from_state_dict", "get_quantization_parameter_to_path_mapping", "is_quantization_param", ] WeightMappingType = Dict[str, str] NestedWeightMappingType = Dict[str, WeightMappingType] def get_safetensors_folder( pretrained_model_name_or_path: str, cache_dir: Optional[str] = None ) -> str: """ Given a Hugging Face stub or a local path, return the folder containing the safetensors weight files :param pretrained_model_name_or_path: local path to model or HF stub :param cache_dir: optional cache dir to search through, if none is specified the model will be searched for in the default TRANSFORMERS_CACHE :return: local folder containing model data """ if os.path.exists(pretrained_model_name_or_path): # argument is a path to a local folder return os.path.abspath(pretrained_model_name_or_path) safetensors_path = cached_file( pretrained_model_name_or_path, SAFE_WEIGHTS_NAME, cache_dir=cache_dir, _raise_exceptions_for_missing_entries=False, ) index_path = cached_file( pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, cache_dir=cache_dir, _raise_exceptions_for_missing_entries=False, ) if safetensors_path is not None: # found a single cached safetensors file return os.path.split(safetensors_path)[0] if index_path is not None: # found a cached safetensors weight index file return os.path.split(index_path)[0] # model weights could not be found locally or cached from HF Hub raise ValueError( "Could not locate safetensors weight or index file from " f"{pretrained_model_name_or_path}." ) def get_safetensors_header(safetensors_path: str) -> Dict[str, str]: """ Extracts the metadata from a safetensors file as JSON :param safetensors_path: path to a safetensors file :return: dictionary of metadata extracted from the safetensors file """ with open(safetensors_path, "rb") as f: length_of_header = struct.unpack(" Optional[str]: """ Helper function extracting the uncompressed parameterized layer name from a compressed name. Assumes the compressed name was merged using merge_names. :param full_name: full name of parameter in compressed model :param param_name: compression paramater name :return: uncompressed name of the uncompressed parameterized layer """ pattern = r"^(.*)\." + param_name + r"$" regex = re.findall(pattern, full_name) if len(regex) == 0: return None return regex[0] def merge_names(parent_name: str, child_name: str) -> str: """ Helper function for merging an uncompressed parameterized layer name with a compression parameter. Names merged with this function can then be parsed by match_param_name. :param parent_name: uncompressed parameterized layer name :param child_name: compression parameter name :return: merged compressed name """ return parent_name + "." + child_name def get_weight_mappings(path_to_model_or_tensors: str) -> Dict[str, str]: """ Takes a path to a state dict saved in safetensors format and returns a mapping from parameterized layer name to file location. 
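    For example, for a sparsity-compressed layer the returned mapping may look like: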
{ layer.weight.bitmask: file_location, layer.weight.row_offsets: file_location, layer.weight.shape: file_location, layer.weight.compressed: file_location } This generalizes to cases where the model is split into multiple safetensors files :param path_to_model_or_tensors: path to directory that contains safetensors (must contain either a single file or multiple files with an index), or a path to a single safetensors file :return: mapping of parameterized layer name to file location """ if os.path.isfile(path_to_model_or_tensors): # we have a single safetensors file to read header = get_safetensors_header(path_to_model_or_tensors) for key in header.keys(): header[key] = path_to_model_or_tensors header.pop("__metadata__", None) else: # we have a directory with multiple safetensors files safetensors_path = os.path.join(path_to_model_or_tensors, SAFE_WEIGHTS_NAME) index_path = os.path.join(path_to_model_or_tensors, SAFE_WEIGHTS_INDEX_NAME) if os.path.exists(safetensors_path): # we have a single safetensors file to read header = get_safetensors_header(safetensors_path) for key in header.keys(): header[key] = SAFE_WEIGHTS_NAME header.pop("__metadata__", None) elif os.path.exists(index_path): # we have multiple safetensors file, read from index with open(index_path, "r", encoding="utf-8") as f: index = json.load(f) header = index["weight_map"] else: raise ValueError( "Could not find a safetensors weight " f"or index file at {path_to_model_or_tensors}" ) # convert weight locations to full paths for key, value in header.items(): header[key] = os.path.join(path_to_model_or_tensors, value) return header def get_nested_weight_mappings( model_path: str, params_to_nest: Iterable[str], return_unmatched_params: bool = False, ) -> Union[NestedWeightMappingType, Tuple[NestedWeightMappingType, WeightMappingType]]: """ Takes a path to a state dict saved in safetensors format and returns a nested mapping from uncompressed parameterized layer names to the file locations of each layer's compression parameters. Example of the nested mapping: layer: { bitmask: file_location, row_offsets: file_location, shape: file_location, compressed: file_location } If other parameters are found that do not match the nested parameters, they will be returned in a separate dictionary only if return_unmatched_params is True. This dictionary may be needed for cases where compressors are stacked (e.g., quantization compression followed by sparse compression). Example of the unmatched params mapping: { layer.weight_scale: file_location, layer.input_scale: file_location } This generalizes to cases where the model is split into multiple safetensors files. :param model_path: Path to the safetensors state dict, must contain either a single safetensors file or multiple files with an index. :param params_to_nest: Iterable of parameter names to nest. :param return_unmatched_params: If True, return a second dictionary containing the remaining parameters that were not matched to the params_to_nest. :return: - If return_unmatched_params is False: NestedWeightMappingType: A nested mapping of parameterized layer names to file locations of each layer's compression parameters. - If return_unmatched_params is True: Tuple[NestedWeightMappingType, WeightMappingType]: A tuple containing: - NestedWeightMappingType: A nested mapping of parameterized layer names to file locations of each layer's compression parameters. - WeightMappingType: A mapping of the remaining parameter names to their file locations that were not matched to the params_to_nest. 
""" weight_mappings = get_weight_mappings(model_path) nested_weight_mappings = {} unmatched_params = {} for key, file_location in weight_mappings.items(): matched = False for param_name in params_to_nest: dense_param = match_param_name(key, param_name) if dense_param: if dense_param not in nested_weight_mappings: nested_weight_mappings[dense_param] = {} nested_weight_mappings[dense_param][param_name] = file_location matched = True if return_unmatched_params and not matched: unmatched_params[key] = file_location if return_unmatched_params: return nested_weight_mappings, unmatched_params return nested_weight_mappings def get_nested_mappings_from_state_dict( state_dict, params_to_nest: Iterable[str] ) -> NestedWeightMappingType: """ Takes a state dict and returns a nested mapping from uncompressed parameterized layer names to the value of each layer's compression parameters. Example of the nested mapping: layer: { weight_scale: ..., weight: ..., zero_point: ..., } :param state_dict: state dict of the model :param params_to_nest: Iterable of parameter names to nest. :return: Nested mapping of parameterized layer names to the value of each layer's compression parameters. """ nested_weight_mappings = {} for key in state_dict.keys(): for param_name in params_to_nest: dense_param = match_param_name(key, param_name) if dense_param: if dense_param not in nested_weight_mappings: nested_weight_mappings[dense_param] = {} nested_weight_mappings[dense_param][param_name] = state_dict[key] return nested_weight_mappings def get_quantization_parameter_to_path_mapping(model_path: str) -> Dict[str, str]: """ Given a model path, return a mapping between a parameter and its path on disk """ weight_mappings = get_weight_mappings(model_path) mapping = {} for weight_name, safe_path in weight_mappings.items(): if is_quantization_param(weight_name): mapping[weight_name] = safe_path continue return mapping def is_quantization_param(name: str) -> bool: """ Checks is a parameter name is associated with a quantization parameter :param name: parameter name to check :return: True if parameter name is a quantization parameter, else False """ if name.endswith("_scale"): return True if name.endswith("zero_point"): return True if name.endswith("g_idx"): return True return False compressed-tensors-0.9.4/src/compressed_tensors/utils/semi_structured_conversions.py000066400000000000000000000322611500222531600314560ustar00rootroot00000000000000# # Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es). # Pulled from nm-vllm/vllm/model_executor/layers/quantization/utils/format_24.py # # flake8: noqa # isort: skip_file # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import torch __all__ = [ "sparse_semi_structured_from_dense_cutlass", "sparse_semi_structured_to_dense_cutlass", "mask_creator", ] # This is PyTorch implementation of main part of reorder_meta() # function, from tools/util/include/cutlass/util/host_reorder.h file # of CUTLASS source tree. 
Furthermore, CUTLASS template for sparse # GEMM decides upon layout of this matrix, and at the moment for the # sparse GEMM executed on tensor cores, this is layout described by # ColumnMajorInterleaved<2> data structure, in # include/cutlass/layout/matrix.h of CUTLASS source tree. The # reordering of meta matrix into meta_reordered matrix calculated # according to these segments of CUTLASS code is re-implemented here. # Note that this calculation produces offsets for scattering metadata # matrix elements into reordered metadata matrix elements (or, # equivalently, for gathering reordered metadata matrix element back # into metadata matrix elements). def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype, device): dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols) dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1) # Reorder the rows, then swizzle the 2x2 blocks. group_x = 64 group_y = 32 if meta_dtype.itemsize == 2 else 16 dst_rows = ( dst_rows // group_x * group_x + (dst_rows % 2) * 2 + (dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 + ((dst_rows % group_x) // 8) * 4 ) topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8) bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8) dst_rows += topright - bottomleft dst_cols -= topright - bottomleft # Assumed that meta tensor is to be stored in CUTLASS # InterleavedColumnMajor layout, and reverse engineered # corresponding code to store values into this tensor. interleave = 2 cols_maj = dst_cols // interleave cols_min = dst_cols % interleave return (cols_maj * m * interleave + dst_rows * interleave + cols_min).view(-1) # This function converts dense matrix into sparse semi-structured # representation, producing "compressed" matrix, in the layout used by # CUTLASS backend, and corresponding metadata matrix. 
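# Worked example for the function below (illustrative; the numbers follow from
# the shape checks inside the function): for a torch.half dense input, ksparse
# is 4 and the metadata dtype is torch.int16, so each metadata element packs
# four 4-bit quadruple encodings, which is why m must be divisible by 32 and k
# by 16. A dense group such as [0, 3, 0, 5] keeps its two non-zeros (3, 5) in
# the "compressed" output and records their positions (indices 1 and 3) as the
# 4-bit code 0b1101 in the metadata.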
def sparse_semi_structured_from_dense_cutlass(dense): if dense.dim() != 2: raise RuntimeError( f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor" # noqa: E501 ) m, k = dense.shape device = dense.device meta_dtype = torch.int8 if dense.dtype == torch.int8: meta_dtype = torch.int32 elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]: meta_dtype = torch.int16 else: raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix") quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 if quadbits_per_meta_elem not in (4, 8): raise RuntimeError("Invalid number of elements per meta element calculated") if meta_dtype == torch.int32: if m % 16 != 0: raise RuntimeError( f"Number of rows of dense matrix {m} must be divisible by 16" ) else: if m % 32 != 0: raise RuntimeError( f"Number of rows of dense matrix {m} must be divisible by 32" ) if k % (4 * quadbits_per_meta_elem) != 0: raise RuntimeError( f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}" # noqa: E501 ) if dense.dtype != torch.float: ksparse = 4 dense_4 = dense.view(-1, k // ksparse, ksparse) m0, m1, m2, m3 = (dense_4 != 0).unbind(-1) else: ksparse = 2 dense_2 = dense.view(-1, k // ksparse, ksparse) m0, m2 = m1, m3 = (dense_2 != 0).unbind(-1) meta_ncols = k // (ksparse * quadbits_per_meta_elem) # Encoding quadruples of True/False values as follows: # [True, True, False, False] -> 0b0100 # [True, False, True, False] -> 0b1000 # [False, True, True, False] -> 0b1001 # [True, False, False, True ] -> 0b1100 # [False, True, False, True ] -> 0b1101 # [False, False, True, True ] -> 0b1110 # Thus, lower two bits in the encoding are index of the True value # at the lowest index in the quadruple, and the higher two bits in # the encoding are index of the other True value in the quadruple. # In case there are less than two True values, than False value or # values at some index or indices are considered True for the # encoding. In case there are more than two True values, then the # excess True value(s) at some indices are considered False for # the encoding. The exact encodings used for these cases are as # follows: # [False, False, False, False] -> 0b1110 # [False, False, False, True ] -> 0b1110 # [False, False, True, False] -> 0b1110 # [False, True, False, False] -> 0b1001 # [False, True, True, True ] -> 0b1101 # [True, False, False, False] -> 0b1000 # [True, False, True, True ] -> 0b1100 # [True, True, False, True ] -> 0b0100 # [True, True, True, False] -> 0b0100 # [True, True, True, True ] -> 0b0100 # These particular encodings are chosen, with the help of Espresso # logic minimizer software, for the purpose of minimization of # corresponding Boolean functions, that translate non-zero flags # into encoding bits. Note also possible choices for the first # and last of these encodings were limited only to (0b0100, # 0b1110), in order to produce valid encodings for 1:2 sparsity # case. 
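    # Spot check of the Boolean expressions below against the table above
    # (illustrative): for the quadruple [True, False, True, False], m0=1, m1=0,
    # m3=0, so expr0 = expr1 = expr2 = 0, giving bit0 = bit1 = bit2 = 0 and
    # bit3 = ~m1 = 1. Then idxs0 = 0b00, idxs1 = 0b10, and the packed code
    # idxs0 | (idxs1 << 2) = 0b1000, matching the listed encoding.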
expr0 = m0 & m1 expr1 = ~m0 & m1 expr2 = ~m0 & ~m1 bit0 = expr1 bit1 = expr2 bit2 = expr0 | expr2 | m3 bit3 = expr1 | ~m1 idxs0 = bit0 | (bit1.to(torch.int64) << 1) idxs1 = bit2 | (bit3.to(torch.int64) << 1) if dense.dtype != torch.float: sparse0 = dense_4.gather( -1, idxs0.unsqueeze(-1) ) # type: ignore[possibly-undefined] sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1)) sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2) else: sparse = dense_2.gather(-1, idxs0.unsqueeze(-1) // 2).view( m, k // 2 ) # type: ignore[possibly-undefined] meta_4 = idxs0 | (idxs1 << 2) meta_n = meta_4.view((-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype) if quadbits_per_meta_elem == 4: meta = ( meta_n[:, :, 0] | (meta_n[:, :, 1] << 4) | (meta_n[:, :, 2] << 8) | (meta_n[:, :, 3] << 12) ) elif quadbits_per_meta_elem == 8: meta = ( meta_n[:, :, 0] | (meta_n[:, :, 1] << 4) | (meta_n[:, :, 2] << 8) | (meta_n[:, :, 3] << 12) | (meta_n[:, :, 4] << 16) | (meta_n[:, :, 5] << 20) | (meta_n[:, :, 6] << 24) | (meta_n[:, :, 7] << 28) ) # Reorder meta tensor elements. meta_reordered = meta.new_empty( (m * meta_ncols,) ) # type: ignore[possibly-undefined] meta_offsets = _calculate_meta_reordering_scatter_offsets( m, meta_ncols, meta_dtype, device ) meta_reordered.scatter_(0, meta_offsets, meta.view(-1)) return (sparse, meta_reordered.view(m, meta_ncols)) # This function performs reverse of the function above - it # reconstructs dense matrix from a pair of "compressed" matrix, given # in the layout used by CUTLASS backend, and accompanying metadata # matrix. def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered): if sparse.dim() != 2: raise RuntimeError( f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor" # noqa: E501 ) m, k = sparse.shape device = sparse.device if meta_reordered.dim() != 2: raise RuntimeError( f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor" # noqa: E501 ) if meta_reordered.device != device: raise RuntimeError( f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device" # noqa: E501 ) meta_dtype = meta_reordered.dtype if meta_dtype not in (torch.int16, torch.int32): raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix") quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4 ksparse = 4 if sparse.dtype != torch.float else 2 meta_nrows, meta_ncols = meta_reordered.shape if meta_nrows != m: raise RuntimeError( f"Number of rows of meta matrix {meta_nrows} must be equal to number of columns of spase matrix {m}" # noqa: E501 ) if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k: raise RuntimeError( f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, " # noqa: E501 "expected according to the number of columns of meta matrix" ) # Undo meta tensor elements reordering. meta_offsets = _calculate_meta_reordering_scatter_offsets( m, meta_ncols, meta_dtype, device ) meta = torch.gather(meta_reordered.view(-1), 0, meta_offsets).view(m, meta_ncols) # Unpack sparse tensor back to original dense tensor, using # information provided by meta tensor. Note that torch.float # datatype is handled pretty much the same as # torch.half/torch.bfloat16, as metadata for a pair of torch.float # value is encoded as if underlying 8 bytes contain four # torch.half/torch.bfloat16 values, where either first two or last # two are zeros. 
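    # Illustrative note: the unpacking below splits each metadata element back
    # into its 2-bit position indices (8 indices per int16 element, 16 per
    # int32 element); those indices are then converted into flat destination
    # offsets used to scatter the compressed values into a zeroed dense tensor.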
meta_2 = torch.empty( (m, meta_ncols, 2 * quadbits_per_meta_elem), dtype=meta_dtype, device=device, ) if quadbits_per_meta_elem == 4: meta_2[:, :, 0] = meta & 0b11 meta_2[:, :, 1] = (meta >> 2) & 0b11 meta_2[:, :, 2] = (meta >> 4) & 0b11 meta_2[:, :, 3] = (meta >> 6) & 0b11 meta_2[:, :, 4] = (meta >> 8) & 0b11 meta_2[:, :, 5] = (meta >> 10) & 0b11 meta_2[:, :, 6] = (meta >> 12) & 0b11 meta_2[:, :, 7] = (meta >> 14) & 0b11 elif quadbits_per_meta_elem == 8: meta_2[:, :, 0] = meta & 0b11 meta_2[:, :, 1] = (meta >> 2) & 0b11 meta_2[:, :, 2] = (meta >> 4) & 0b11 meta_2[:, :, 3] = (meta >> 6) & 0b11 meta_2[:, :, 4] = (meta >> 8) & 0b11 meta_2[:, :, 5] = (meta >> 10) & 0b11 meta_2[:, :, 6] = (meta >> 12) & 0b11 meta_2[:, :, 7] = (meta >> 14) & 0b11 meta_2[:, :, 8] = (meta >> 16) & 0b11 meta_2[:, :, 9] = (meta >> 18) & 0b11 meta_2[:, :, 10] = (meta >> 20) & 0b11 meta_2[:, :, 11] = (meta >> 22) & 0b11 meta_2[:, :, 12] = (meta >> 24) & 0b11 meta_2[:, :, 13] = (meta >> 26) & 0b11 meta_2[:, :, 14] = (meta >> 28) & 0b11 meta_2[:, :, 15] = (meta >> 30) & 0b11 dense_offsets = meta_2.view(-1) + ( torch.arange(0, 2 * m * k // ksparse, device=device) * 4 ).view(-1, 1).repeat(1, 2).view(-1) dense = torch.zeros((m * 2 * k,), dtype=sparse.dtype, device=device) if sparse.dtype != torch.float: # dense.scatter_(0, dense_offsets, sparse.view(-1)) dense.scatter_(0, dense_offsets, sparse.reshape(-1)) else: dense.view(torch.half).scatter_( 0, dense_offsets, sparse.view(torch.half).view(-1) ) return dense.view(m, 2 * k) def mask_creator(tensor): """ Class for creating N:M sparsity masks. Masks will be created using the N:M ratio, where for every block of M weights, N will be pruned based on ranked weight value. Each mask will correspond to the given tensor. :param N: The number of weights in a group to keep :param M: The size of a weight group """ N = 2 M = 4 mask = None # for i, tensor in enumerate(tensors): if tensor.numel() % M != 0: raise ValueError( f"Tensor of size {tensor.shape} can't be evenly divided into " f"{M} groups" ) num_groups = tensor.numel() // M # N:M sparsity for linear layers tensor_temp = tensor.detach().abs().reshape(num_groups, M) index = torch.argsort(tensor_temp, dim=1)[:, : int(M - N)] w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device) mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape) return mask compressed-tensors-0.9.4/tests/000077500000000000000000000000001500222531600165215ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/__init__.py000066400000000000000000000011511500222531600206300ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/conftest.py000066400000000000000000000120461500222531600207230ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from math import ceil import pytest import torch from compressed_tensors.quantization.utils import calculate_qparams from compressed_tensors.utils.offload import update_parameter_data def _get_dim(dim: int, value: torch.Tensor): if isinstance(dim, int): dim = [dim] dim = set(dim) reduce_dims = tuple(idx for idx in range(value.ndim) if idx not in dim) return reduce_dims @pytest.fixture def mock_per_token_calibration(): def update_scale_zp(module: torch.nn.Module, base_name: str, value: torch.Tensor): quantization_scheme = getattr(module, "quantization_scheme", None) if not quantization_scheme: # no quantization scheme nothing to do return arg_name = "weights" if base_name == "weight" else f"{base_name}_activations" args = getattr(quantization_scheme, arg_name, None) dim = _get_dim({0, 1}, value) min_val = torch.amin(value, dim=dim, keepdims=True) max_val = torch.amax(value, dim=dim, keepdims=True) scale, zp = calculate_qparams(min_val, max_val, args) update_parameter_data(module, scale, f"{base_name}_scale") update_parameter_data(module, zp, f"{base_name}_zero_point") return update_scale_zp @pytest.fixture def mock_per_group_calibration(): def update_scale_zp( module: torch.nn.Module, base_name: str, value: torch.Tensor, group_size: int ): quantization_scheme = getattr(module, "quantization_scheme", None) if not quantization_scheme: # no quantization scheme nothing to do return arg_name = "weights" if base_name == "weight" else f"{base_name}_activations" args = getattr(quantization_scheme, arg_name, None) rows = value.shape[0] columns = value.shape[1] num_groups = int(ceil(columns / group_size)) scale = torch.zeros((rows, num_groups), dtype=value.dtype, device=value.device) zp_dtype = args.pytorch_dtype() zp = torch.zeros((rows, num_groups), dtype=zp_dtype, device=value.device) group_sizes = torch.full((num_groups,), group_size, dtype=torch.int) end = 0 for group_index, group_count in enumerate(group_sizes): start = end end = start + group_count dim = _get_dim( 0, value[:, start:end], ) min_val = torch.amin(value, dim=dim, keepdims=True) max_val = torch.amax(value, dim=dim, keepdims=True) scale_out, zp_out = calculate_qparams(min_val, max_val, args) scale[:, group_index] = scale_out.squeeze(1) zp[:, group_index] = zp_out.squeeze(1) update_parameter_data(module, scale, f"{base_name}_scale") update_parameter_data(module, zp, f"{base_name}_zero_point") return update_scale_zp @pytest.fixture def mock_per_channel_calibration(): def update_scale_zp(module: torch.nn.Module, base_name: str, value: torch.Tensor): quantization_scheme = getattr(module, "quantization_scheme", None) if not quantization_scheme: # no quantization scheme nothing to do return arg_name = "weights" if base_name == "weight" else f"{base_name}_activations" args = getattr(quantization_scheme, arg_name, None) dim = _get_dim(0, value) min_val = torch.amin(value, dim=dim, keepdims=True) max_val = torch.amax(value, dim=dim, keepdims=True) scale, zp = calculate_qparams(min_val, max_val, args) update_parameter_data(module, scale, f"{base_name}_scale") update_parameter_data(module, zp, f"{base_name}_zero_point") return update_scale_zp 
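# Illustrative usage of the calibration fixtures above (hedged sketch; the
# Linear layer and channel-wise quantization config here are hypothetical and
# not defined in this file). Each fixture returns a callable that writes
# {base_name}_scale and {base_name}_zero_point onto a module that already has a
# quantization_scheme attached:
#
#     def test_channel_scales(mock_per_channel_calibration):
#         layer = torch.nn.Linear(16, 32, bias=False)
#         apply_quantization_config(layer, channel_wise_config)  # assumed config
#         mock_per_channel_calibration(layer, base_name="weight", value=layer.weight)
#         # one (scale, zero_point) pair per output channel -> shape (32, 1)
#         assert layer.weight_scale.shape == (32, 1)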
@pytest.fixture def mock_per_tensor_calibration(): def update_scale_zp(module: torch.nn.Module, base_name: str, value: torch.Tensor): quantization_scheme = getattr(module, "quantization_scheme", None) if not quantization_scheme: # no quantization scheme nothing to do return arg_name = "weights" if base_name == "weight" else f"{base_name}_activations" args = getattr(quantization_scheme, arg_name, None) # per tensor quantization just calls calculate_qparams directly min_val, max_val = torch.aminmax(value) scale, zp = calculate_qparams(min_val, max_val, args) update_parameter_data(module, scale, f"{base_name}_scale") update_parameter_data(module, zp, f"{base_name}_zero_point") return update_scale_zp compressed-tensors-0.9.4/tests/test_compressors/000077500000000000000000000000001500222531600221375ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_compressors/__init__.py000066400000000000000000000011511500222531600242460ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_compressors/model_compressors/000077500000000000000000000000001500222531600256765ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_compressors/model_compressors/__init__.py000066400000000000000000000011511500222531600300050ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_compressors/model_compressors/test_model_compressor.py000066400000000000000000000301771500222531600326730ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import json from copy import deepcopy from pathlib import Path import pytest import torch import torch.nn as nn from compressed_tensors.compressors import ModelCompressor from compressed_tensors.config import SparsityCompressionConfig from compressed_tensors.quantization import QuantizationConfig from safetensors.torch import save_file from tests.testing_utils import induce_sparsity, requires_hf_quantizer def sparsity_config(): return { "format": "sparse-bitmask", # dense format is ignored by ModelCompressor "global_sparsity": 19.098103233975568, "registry_requires_subclass": False, "sparsity_structure": "unstructured", } def quantization_config(): return { "config_groups": { "group_0": { "targets": ["Linear"], "weights": { "num_bits": 4, "strategy": "channel", "symmetric": True, "type": "int", }, } }, "format": "pack-quantized", "global_compression_ratio": 1.891791164021256, "ignore": ["lm_head"], "quant_method": "compressed-tensors", "quantization_status": "frozen", } @pytest.mark.parametrize("s_config", [sparsity_config(), None]) @pytest.mark.parametrize("q_config", [quantization_config(), None]) def test_config_format(s_config, q_config): combined_config = _get_combined_config(s_config, q_config) assert ModelCompressor.parse_sparsity_config(combined_config) == s_config assert ModelCompressor.parse_quantization_config(combined_config) == q_config @requires_hf_quantizer() @pytest.mark.parametrize( "s_config,q_config", [ (sparsity_config(), quantization_config()), (sparsity_config(), None), (None, quantization_config()), (None, None), ], ) def test_hf_compressor_tensors_config(s_config, q_config, tmp_path): from transformers.utils.quantization_config import CompressedTensorsConfig combined_config = _get_combined_config(s_config, q_config) compression_config = CompressedTensorsConfig(**combined_config) compressor = ModelCompressor.from_compression_config(compression_config) if s_config is q_config is None: assert compressor is None return s_config = ( SparsityCompressionConfig.load_from_registry(s_config.get("format"), **s_config) if s_config is not None else None ) q_config = QuantizationConfig(**q_config) if q_config is not None else None s_config_dict = s_config.model_dump() if s_config is not None else None q_config_dict = q_config.model_dump() if q_config is not None else None assert compressor.sparsity_config == s_config assert compressor.quantization_config == q_config assert ModelCompressor.parse_sparsity_config(compression_config) == s_config_dict assert ( ModelCompressor.parse_quantization_config(compression_config) == q_config_dict ) class DummyLinearModel(nn.Module): def __init__(self, weights, weight_scale=None, weight_zero_point=None): super(DummyLinearModel, self).__init__() out_features, in_features = weights.shape # Define a linear layer without bias self.linear = nn.Linear(in_features, out_features, bias=False) # Set the weights of the linear layer self.linear.weight = nn.Parameter(weights, requires_grad=False) # Attach weight_scale and weight_zero_point as parameters if weight_scale is not None: self.linear.weight_scale = nn.Parameter( torch.tensor(weight_scale), requires_grad=False ) if weight_zero_point is not None: self.linear.weight_zero_point = nn.Parameter( torch.tensor(weight_zero_point), requires_grad=False ) def forward(self, x): return self.linear(x) def get_bitmask_sparsity_config(targets=["Linear"]): from compressed_tensors import BitmaskConfig return BitmaskConfig( format="sparse-bitmask", global_sparsity=0.7, targets=targets, 
sparsity_structure="unstructured", ) def create_quantization_config(bits=8, type="int", strategy="tensor"): config_dict = { "format": "int-quantized", "global_compression_ratio": 1.0, "quant_method": "compressed-tensors", "config_groups": { "group_0": { "targets": ["Linear"], "weights": { "num_bits": bits, "strategy": strategy, "symmetric": True, "type": type, }, } }, } return QuantizationConfig.model_validate(config_dict) @pytest.mark.parametrize("sparsity_config", [get_bitmask_sparsity_config()]) @pytest.mark.parametrize( "quantization_config", [ create_quantization_config(bits=8, type="int", strategy="channel"), create_quantization_config(bits=8, type="float", strategy="channel"), ], ) def test_composability(tmp_path, sparsity_config, quantization_config): model_compressor = ModelCompressor( sparsity_config=sparsity_config, quantization_config=quantization_config ) fake_oneshot_model: DummyLinearModel = _get_fake_oneshot_sparse_quantized_model( sparsity_config=sparsity_config, quantization_config=quantization_config ) fake_oneshot_model = fake_oneshot_model.to(torch.float32) # does both sparse and quantization compression compressed_state_dict = model_compressor.compress(fake_oneshot_model) save_dir = tmp_path / "model" save_dir = _create_dummy_checkpoint( compressed_state_dict, save_dir, model_compressor ) decompressed_model = DummyLinearModel( torch.zeros_like(fake_oneshot_model.linear.weight) ) decompressed_model = decompressed_model.float() model_compressor.decompress(model=decompressed_model, model_path=save_dir) # check that the decompressed model is the same as the original model _check_state_dicts(fake_oneshot_model.state_dict(), decompressed_model.state_dict()) @pytest.mark.parametrize( "sparsity_config, quantization_config, missing, unexpected", [ ( get_bitmask_sparsity_config(), create_quantization_config(bits=8, type="int", strategy="channel"), {"linear.weight"}, { "linear.bitmask", "linear.compressed", "linear.row_offsets", "linear.shape", "linear.weight_scale", }, ) ], ) def test_missing_and_unexpected_keys_on_compression( tmp_path, sparsity_config, quantization_config, missing, unexpected ): model_compressor = ModelCompressor( sparsity_config=sparsity_config, quantization_config=quantization_config ) fake_oneshot_model: DummyLinearModel = _get_fake_oneshot_sparse_quantized_model( sparsity_config=sparsity_config, quantization_config=quantization_config ) og_state_dict_keys = set( DummyLinearModel(weights=torch.randn(10, 5)).state_dict().keys() ) compressed_state_dict_keys = set( model_compressor.compress(fake_oneshot_model).keys() ) assert og_state_dict_keys - compressed_state_dict_keys == missing assert compressed_state_dict_keys - og_state_dict_keys == unexpected class TwoLayerModel(nn.Module): def __init__(self): super(TwoLayerModel, self).__init__() self.layer1 = nn.Linear(10, 10, bias=False) self.layer2 = nn.Linear(10, 10, bias=False) def forward(self, x): x = self.layer1(x) x = self.layer2(x) return x @pytest.mark.parametrize( "model, sparsity_config, quantization_config, expected", [ ( TwoLayerModel(), get_bitmask_sparsity_config(targets=["re:.*layer1$"]), create_quantization_config(bits=8, type="int", strategy="channel"), {"layer1.weight"}, ) ], ) def test_get_missing_keys(model, sparsity_config, quantization_config, expected): model_compressor = ModelCompressor( sparsity_config=sparsity_config, quantization_config=quantization_config ) actual = model_compressor.get_missing_module_keys(model) assert len(actual) == len(expected) and all(key in actual for key in 
expected) @pytest.mark.parametrize( "model, sparsity_config, quantization_config, expected", [ ( TwoLayerModel(), get_bitmask_sparsity_config(targets=["re:.*layer1$"]), create_quantization_config(bits=8, type="int", strategy="channel"), { f"{layer}.{suffix}" for layer, suffixes in { "layer1": [ "shape", "row_offsets", "weight_zero_point", "weight_g_idx", "bitmask", "weight_scale", "compressed", ], "layer2": ["weight_scale", "weight_zero_point", "weight_g_idx"], }.items() for suffix in suffixes }, ) ], ) def test_get_unexpected_keys(model, sparsity_config, quantization_config, expected): model_compressor = ModelCompressor( sparsity_config=sparsity_config, quantization_config=quantization_config ) actual = model_compressor.get_unexpected_file_keys(model) assert len(actual) == len(expected) and all(key in actual for key in expected) def _create_dummy_checkpoint(state_dict, save_dir, model_compressor): save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) save_file(state_dict, save_dir / "model.safetensors") config_file_path = save_dir / "config.json" with open(config_file_path, "w") as config_file: json.dump({}, config_file, indent=2, sort_keys=True) model_compressor.update_config(save_dir) return save_dir def _check_state_dicts(state_dict1, state_dict2): for key in state_dict1.keys(): assert key in state_dict2, f"Missing tensor: {key}" if key.endswith("weight"): original_tensor = state_dict1[key] decompressed_tensor = state_dict2[key].to(original_tensor.dtype) diff = torch.abs(original_tensor - decompressed_tensor) assert not torch.any(diff > 0.01), f"Max diff: {torch.max(diff)}" def _get_fake_oneshot_sparse_quantized_model(quantization_config, sparsity_config): from compressed_tensors.quantization.lifecycle.forward import quantize weights = torch.rand(10, 5) sparse_weights = induce_sparsity(weights, sparsity_config.global_sparsity) quantization_args = quantization_config.config_groups["group_0"].weights if quantization_args.strategy == "channel": scale = torch.ones((weights.shape[0], 1)) elif quantization_args.strategy == "tensor": scale = torch.tensor([1.0]) zero_point = torch.zeros_like(scale) quantized_weights = quantize( sparse_weights, scale=scale, zero_point=zero_point, args=quantization_args, ) fake_oneshot_model = DummyLinearModel(quantized_weights, scale, zero_point) fake_oneshot_model.linear.quantization_scheme = quantization_config.config_groups[ "group_0" ] return fake_oneshot_model def _get_combined_config(s_config, q_config): combined = {} if q_config is not None: combined = deepcopy(q_config) if s_config is not None: combined["sparsity_config"] = s_config return combined compressed-tensors-0.9.4/tests/test_compressors/quantized_compressors/000077500000000000000000000000001500222531600266025ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_compressors/quantized_compressors/__init__.py000066400000000000000000000011511500222531600307110ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_compressors/quantized_compressors/test_fp8_quant.py000066400000000000000000000126331500222531600321250ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import shutil from collections import OrderedDict import pytest import torch from compressed_tensors import FloatQuantizationCompressor from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStatus, QuantizationStrategy, apply_quantization_config, ) from compressed_tensors.quantization.lifecycle.forward import fake_quantize from safetensors.torch import save_file from torch.nn.modules import Linear, Sequential def get_dummy_quant_config(strategy, group_size=None): config_groups = { "group_1": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs( strategy=strategy, type="float", group_size=group_size ), ), } ignore = ["lm_head"] quant_config = QuantizationConfig( config_groups=config_groups, ignore=ignore, ) return quant_config def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor: perm = torch.randperm(columns) return torch.tensor([index // group_size for index in range(columns)])[perm] @pytest.mark.parametrize( "strategy,group_size,sc,zp", [ [QuantizationStrategy.TENSOR, None, 0.01, 0], [ QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1), dtype=torch.int8), ], [ QuantizationStrategy.CHANNEL, None, torch.rand((512, 1)) * 0.01, torch.zeros((512, 1), dtype=torch.int8), ], ], ) def test_quant_format(strategy, group_size, sc, zp): dense_state_dict = { "dummy.weight": torch.rand((512, 1024)), "dummy.weight_scale": torch.tensor(sc, dtype=torch.float32), "dummy.weight_zero_point": torch.tensor(zp, dtype=torch.float32), } if group_size is not None: dense_state_dict["dummy.weight_g_idx"] = make_dummy_g_idx(512, group_size) quant_config = get_dummy_quant_config(strategy=strategy, group_size=group_size) compressor = FloatQuantizationCompressor(config=quant_config) quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights} compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) # state_dict params should be the same, minus the zero_point if symmetric assert len(dense_state_dict) == len(compressed_state_dict) + 1 # check compressed to int8 assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32 assert torch.equal( compressed_state_dict["dummy.weight_scale"], dense_state_dict["dummy.weight_scale"], ) if group_size is not None: assert torch.equal( compressed_state_dict["dummy.weight_g_idx"], dense_state_dict["dummy.weight_g_idx"], ) @pytest.mark.parametrize( "strategy,group_size", [ [QuantizationStrategy.TENSOR, None], [QuantizationStrategy.CHANNEL, None], # Note that group quantization is not supported ], ) def 
test_reload_match( mock_per_group_calibration, mock_per_channel_calibration, strategy, group_size, tmp_path, ): model = Sequential( OrderedDict( [ ("dummy", Linear(512, 1024, bias=None)), ] ) ) quant_config = get_dummy_quant_config(strategy=strategy, group_size=group_size) apply_quantization_config(model, quant_config) model.dummy.quantization_status = QuantizationStatus.CALIBRATION if strategy == QuantizationStrategy.GROUP: mock_per_group_calibration( model.dummy, base_name="weight", value=model.dummy.weight, group_size=128 ) if strategy == QuantizationStrategy.CHANNEL: mock_per_channel_calibration( model.dummy, base_name="weight", value=model.dummy.weight ) compressor = FloatQuantizationCompressor(config=quant_config) quantized_modules_to_args = { "dummy": quant_config.config_groups["group_1"].weights, } compressed_state_dict = compressor.compress( model.state_dict(), names_to_scheme=quantized_modules_to_args ) save_file(compressed_state_dict, tmp_path / "model.safetensors") reconstructed_dense_gen = compressor.decompress( tmp_path, names_to_scheme=quantized_modules_to_args ) reconstructed_dense = {} for name, value in reconstructed_dense_gen: reconstructed_dense[name] = value fake_quant_dummy = fake_quantize( model.dummy.weight, scale=model.dummy.weight_scale, zero_point=model.dummy.weight_zero_point, args=quantized_modules_to_args["dummy"], ) assert torch.equal(fake_quant_dummy, reconstructed_dense["dummy"].get("weight")) shutil.rmtree(tmp_path) compressed-tensors-0.9.4/tests/test_compressors/quantized_compressors/test_int_quant.py000066400000000000000000000130661500222531600322230ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
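# Condensed sketch of the round trip exercised by the tests in this file
# (illustrative; dense_state_dict is hypothetical, other names match the
# imports and helpers that follow):
#
#     config = get_dummy_quant_config(strategy="tensor")
#     args = config.config_groups["group_1"].weights
#     compressor = IntQuantizationCompressor(config=config)
#     compressed = compressor.compress(dense_state_dict, names_to_scheme={"dummy": args})
#     save_file(compressed, "model.safetensors")
#     for name, value in compressor.decompress(".", names_to_scheme={"dummy": args}):
#         ...  # yields ("dummy", {"weight": dequantized_tensor})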
import shutil import pytest import torch from compressed_tensors import IntQuantizationCompressor from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStrategy, ) from compressed_tensors.quantization.lifecycle.forward import fake_quantize from safetensors.torch import save_file def get_dummy_quant_config(strategy, group_size=None, symmetric=True): config_groups = { "group_1": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs( strategy=strategy, group_size=group_size, symmetric=symmetric ), ), } ignore = ["lm_head"] quant_config = QuantizationConfig( config_groups=config_groups, ignore=ignore, ) return quant_config @pytest.mark.parametrize( "strategy,symmetric,group_size,sc,zp", [ [QuantizationStrategy.TENSOR, True, None, 0.01, 0], [ QuantizationStrategy.GROUP, True, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1), dtype=torch.int8), ], [ QuantizationStrategy.CHANNEL, False, None, torch.rand((512, 1)) * 0.01, ((torch.rand((512, 1)) - 0.5) * 127).to(torch.int8), ], ], ) def test_quant_format(strategy, symmetric, group_size, sc, zp): dense_state_dict = { "dummy.weight": torch.rand((512, 1024)), "dummy.weight_scale": torch.tensor(sc, dtype=torch.float32), "dummy.weight_zero_point": torch.tensor(zp, dtype=torch.int32), } quant_config = get_dummy_quant_config( strategy=strategy, group_size=group_size, symmetric=symmetric ) compressor = IntQuantizationCompressor(config=quant_config) quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights} compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) # state_dict params should be the same, minus the zero_point if symmetric if symmetric: assert len(dense_state_dict) == len(compressed_state_dict) + 1 else: assert len(dense_state_dict) == len(compressed_state_dict) # check compressed to int8 assert compressed_state_dict["dummy.weight"].dtype == torch.int8 assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32 if not symmetric: assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32 @pytest.mark.parametrize( "strategy,group_size,sc,zp", [ [QuantizationStrategy.TENSOR, None, 0.01, 0], [ QuantizationStrategy.GROUP, 128, torch.rand((300, 8)) * 0.01, torch.zeros((300, 8), dtype=torch.int8), ], [ QuantizationStrategy.CHANNEL, None, torch.rand((300, 1)) * 0.01, torch.zeros((300, 1), dtype=torch.int8), ], ], ) def test_reload_match(strategy, group_size, sc, zp, tmp_path): dense_state_dict = { "dummy.weight": torch.rand((300, 1024)), "dummy.weight_scale": torch.tensor(sc, dtype=torch.float32), "dummy.weight_zero_point": torch.tensor(zp, dtype=torch.int32), "dummy2.weight": torch.rand((300, 1024)), "dummy2.weight_scale": torch.tensor(sc, dtype=torch.float32), "dummy2.weight_zero_point": torch.tensor(zp, dtype=torch.int32), } quant_config = get_dummy_quant_config(strategy=strategy, group_size=group_size) compressor = IntQuantizationCompressor(config=quant_config) quantized_modules_to_args = { "dummy": quant_config.config_groups["group_1"].weights, "dummy2": quant_config.config_groups["group_1"].weights, } compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) save_file(compressed_state_dict, tmp_path / "model.safetensors") reconstructed_dense_gen = compressor.decompress( tmp_path, names_to_scheme=quantized_modules_to_args ) reconstructed_dense = {} for name, value in reconstructed_dense_gen: 
reconstructed_dense[name] = value fake_quant_dummy = fake_quantize( dense_state_dict["dummy.weight"], scale=dense_state_dict["dummy.weight_scale"], zero_point=dense_state_dict["dummy.weight_zero_point"], args=quantized_modules_to_args["dummy"], ) assert torch.equal( fake_quant_dummy, reconstructed_dense["dummy"].get("weight").to(torch.float32) ) fake_quant_dummy2 = fake_quantize( dense_state_dict["dummy2.weight"], scale=dense_state_dict["dummy2.weight_scale"], zero_point=dense_state_dict["dummy2.weight_zero_point"], args=quantized_modules_to_args["dummy2"], ) assert torch.equal( fake_quant_dummy2, reconstructed_dense["dummy2"].get("weight").to(torch.float32) ) shutil.rmtree(tmp_path) compressed-tensors-0.9.4/tests/test_compressors/quantized_compressors/test_pack_quant.py000066400000000000000000000372471500222531600323560ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import shutil from collections import OrderedDict import pytest import torch from compressed_tensors import PackedQuantizationCompressor from compressed_tensors.compressors.quantized_compressors.pack_quantized import ( pack_to_int32, unpack_from_int32, ) from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStatus, QuantizationStrategy, apply_quantization_config, ) from compressed_tensors.quantization.lifecycle.forward import fake_quantize from compressed_tensors.quantization.quant_args import ActivationOrdering from safetensors.torch import save_file from torch.nn.modules import Linear, Sequential def get_dummy_quant_config( num_bits=4, strategy=None, group_size=None, actorder=None, symmetric=True ): config_groups = { "group_1": QuantizationScheme( targets=["Linear"], weights=QuantizationArgs( num_bits=num_bits, strategy=strategy, group_size=group_size, actorder=actorder, symmetric=symmetric, ), ), } return QuantizationConfig(config_groups=config_groups) def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor: perm = torch.randperm(columns) return torch.nn.Parameter( (torch.arange(columns, dtype=torch.int) // group_size)[perm], requires_grad=False, ) @pytest.mark.parametrize( "shape", [ (512, 1024), (830, 545), (342, 512), (256, 700), ], ) def test_quant_format(shape): dense_state_dict = { "dummy.weight": torch.rand(shape), "dummy.weight_scale": torch.tensor(0.01, dtype=torch.float32), "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int8), } quant_config = get_dummy_quant_config() compressor = PackedQuantizationCompressor(config=quant_config) quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights} compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) # compressed state_dict adds one entry for shape # but removes the zero points since we are symmetric assert len(dense_state_dict) == len(compressed_state_dict) # check compressed and packed assert 
compressed_state_dict["dummy.weight_packed"].dtype == torch.int32 expected_rows = shape[0] expected_columns = math.ceil(shape[1] / 8) # round each row up to nearest int32 assert compressed_state_dict["dummy.weight_packed"].shape == ( expected_rows, expected_columns, ) assert torch.equal(compressed_state_dict["dummy.weight_shape"], torch.tensor(shape)) assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32 @pytest.mark.parametrize( "value", [ torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 2, 3, 4, 5, 6, 7, 0], [-1, -2, -3, -4, -5, -6, -7, -8]]), (torch.rand((32, 100)) * 16 - 8), ], ) def test_repack_4bit(value): value = value.to(torch.int8) shape = value.shape assert not torch.any(value > 7).item() assert not torch.any(value < -8).item() packed = pack_to_int32(value, 4) unpacked = unpack_from_int32(packed, 4, shape) assert torch.equal(value, unpacked) @pytest.mark.parametrize( "value", [ torch.tensor([[30, 40], [50, 60]]), torch.tensor( [[10, 15, 20, 25, 30, 35, 40, 45], [-10, -20, -30, -40, -50, -60, -70, -80]] ), (torch.rand((32, 100)) * 256 - 128), ], ) def test_repack_8bit(value): value = value.to(torch.int8) shape = value.shape assert not torch.any(value > 127).item() assert not torch.any(value < -128).item() packed = pack_to_int32(value, 8) unpacked = unpack_from_int32(packed, 8, shape) assert torch.equal(value, unpacked) @pytest.mark.parametrize("num_bits", [4, 8]) def test_reload_match(tmp_path, num_bits): dense_state_dict = { "dummy.weight": torch.rand((511, 350)), "dummy.weight_scale": torch.tensor(0.01, dtype=torch.float32), "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int8), "dummy2.weight": torch.rand((128, 280)), "dummy2.weight_scale": torch.tensor(0.02, dtype=torch.float32), "dummy2.weight_zero_point": torch.tensor(15, dtype=torch.int8), } # pack-compressor only needs the number of bits from the quant-args to decompress # all other information is extracted from the compressed data directly names_to_scheme = { "dummy": QuantizationArgs(num_bits=num_bits), "dummy2": QuantizationArgs(num_bits=num_bits), } quant_config = get_dummy_quant_config(num_bits, symmetric=False) compressor = PackedQuantizationCompressor(config=quant_config) quantized_modules_to_args = { "dummy": quant_config.config_groups["group_1"].weights, "dummy2": quant_config.config_groups["group_1"].weights, } compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) save_file(compressed_state_dict, tmp_path / "model.safetensors") reconstructed_dense_gen = compressor.decompress( tmp_path, names_to_scheme=names_to_scheme ) reconstructed_dense = {} for name, value in reconstructed_dense_gen: reconstructed_dense[name] = value fake_quant_dummy = fake_quantize( dense_state_dict["dummy.weight"], scale=dense_state_dict["dummy.weight_scale"], zero_point=dense_state_dict["dummy.weight_zero_point"], args=quantized_modules_to_args["dummy"], ) assert torch.equal( fake_quant_dummy, reconstructed_dense["dummy"].get("weight").to(torch.float32) ) fake_quant_dummy2 = fake_quantize( dense_state_dict["dummy2.weight"], scale=dense_state_dict["dummy2.weight_scale"], zero_point=dense_state_dict["dummy2.weight_zero_point"], args=quantized_modules_to_args["dummy2"], ) assert torch.equal( fake_quant_dummy2, reconstructed_dense["dummy2"].get("weight").to(torch.float32) ) shutil.rmtree(tmp_path) @pytest.mark.parametrize( "strategy", {QuantizationStrategy.GROUP, QuantizationStrategy.CHANNEL}, ) def test_asymmetric_packed_support(strategy): shape = (1024, 1024) 
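    # Packing arithmetic behind the asserts further down (illustrative): with
    # the default 4-bit weights, eight values fit in one int32, so a
    # (1024, 1024) weight packs to (1024, ceil(1024 / 8)) = (1024, 128), and
    # the asymmetric zero points are packed along dim 0 into
    # ceil(1024 / 8) = 128 rows.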
group_size = None if strategy == QuantizationStrategy.GROUP: group_size = 128 if strategy == QuantizationStrategy.CHANNEL: expected_shape = (shape[0], 1) elif strategy == QuantizationStrategy.GROUP: num_groups = shape[1] // group_size expected_shape = (shape[0], max(num_groups, 1)) dense_state_dict = { "dummy.weight": torch.rand(shape), "dummy.weight_scale": torch.rand(expected_shape).to(torch.float32), "dummy.weight_zero_point": torch.rand(expected_shape).to(torch.int8), } quant_config = get_dummy_quant_config( strategy=strategy.value, symmetric=False, group_size=group_size ) compressor = PackedQuantizationCompressor(config=quant_config) quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights} compressed_state_dict = compressor.compress( dense_state_dict, names_to_scheme=quantized_modules_to_args ) # compressed state_dict adds one entry for shape assert len(dense_state_dict) + 1 == len(compressed_state_dict) assert compressed_state_dict["dummy.weight_packed"].dtype == torch.int32 assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32 assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32 # check weight compressed and packed expected_rows = shape[0] expected_columns = math.ceil(shape[1] / 8) # round each row up to nearest int32 assert compressed_state_dict["dummy.weight_packed"].shape == ( expected_rows, expected_columns, ) assert torch.equal(compressed_state_dict["dummy.weight_shape"], torch.tensor(shape)) # check zp compressed and packed packed_size_zp = math.ceil(shape[0] / 8) zp_factor = group_size if strategy == QuantizationStrategy.GROUP else shape[-1] assert compressed_state_dict["dummy.weight_zero_point"].shape == ( packed_size_zp, shape[-1] // zp_factor, ) @pytest.mark.parametrize( "actorder", [ ActivationOrdering.GROUP, ActivationOrdering.WEIGHT, None, ], ) def test_actorder_reload_match(actorder, tmp_path, mock_per_group_calibration): model = Sequential(OrderedDict([("dummy", Linear(512, 1024, bias=None))])) group_size = 128 quant_config = get_dummy_quant_config( strategy="group", group_size=group_size, actorder=actorder ) apply_quantization_config(model, quant_config) # run calibration model.quantization_status = QuantizationStatus.CALIBRATION mock_per_group_calibration( model.dummy, base_name="weight", value=model.dummy.weight, group_size=group_size ) # apply gptq if actorder == ActivationOrdering.GROUP: init_g_idx = make_dummy_g_idx(512, group_size) model.dummy.register_parameter("weight_g_idx", init_g_idx) # compress compressor = PackedQuantizationCompressor(config=quant_config) quantized_modules_to_args = { "dummy": quant_config.config_groups["group_1"].weights, } compressed_state_dict = compressor.compress( model.state_dict(), names_to_scheme=quantized_modules_to_args ) save_file(compressed_state_dict, tmp_path / "model.safetensors") # decompress reconstructed_dense_gen = compressor.decompress( tmp_path, names_to_scheme=quantized_modules_to_args ) reconstructed_dense = {} for name, value in reconstructed_dense_gen: reconstructed_dense[name] = value fake_quant_dummy = fake_quantize( model.dummy.weight, scale=model.dummy.weight_scale, zero_point=model.dummy.weight_zero_point, g_idx=getattr(model.dummy, "weight_g_idx", None), args=quantized_modules_to_args["dummy"], ) assert torch.equal(fake_quant_dummy, reconstructed_dense["dummy"].get("weight")) shutil.rmtree(tmp_path) @pytest.mark.parametrize( "num_bits,values,expected_values", [ ( 4, torch.tensor([[1]]), torch.tensor([[9]], dtype=torch.int32), ), ( 8, 
torch.tensor([[1]]), torch.tensor([[129]], dtype=torch.int32), ), # 0000 0000 0000 0000 1100 1011 1010 1001 (4, torch.tensor([[1, 2, 3, 4]]), torch.tensor([[52137]], dtype=torch.int32)), # 0111 0110 0101 0100 0011 0010 0001 0000 ( 4, torch.tensor([[-8, -7, -6, -5, -4, -3, -2, -1]]), torch.tensor([[1985229328]], dtype=torch.int32), ), # 10000100 10000011 10000010 10000001 ( 8, torch.tensor([[1, 2, 3, 4]]), torch.tensor([[-2071756159]], dtype=torch.int32), ), # 00000011 00000010 00000001 00000000 ( 8, torch.tensor([[-128, -127, -126, -125]]), torch.tensor([[50462976]], dtype=torch.int32), ), ( 4, torch.tensor([[-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4]]), torch.tensor([[1985229328, 52137]], dtype=torch.int32), ), ( 4, torch.tensor( [ [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, -8, -8, -8, -8], [1, 2, 3, 4, -8, -8, -8, -8, -8, -7, -6, -5, -4, -3, -2, -1], ] ), torch.tensor([[1985229328, 52137], [52137, 1985229328]], dtype=torch.int32), ), ( 8, torch.tensor( [ [1, 2, 3, 4], [-128, -127, -126, -125], ] ), torch.tensor([[-2071756159], [50462976]], dtype=torch.int32), ), ( 8, torch.tensor( [ [1, 2, 3, 4, -128, -127, -126, -125], [-128, -127, -126, -125, 1, 2, 3, 4], ] ), torch.tensor( [[-2071756159, 50462976], [50462976, -2071756159]], dtype=torch.int32 ), ), ], ) def test_pack_to_int32(num_bits, values, expected_values): values = values.to(torch.int8) packed_values = pack_to_int32(values, num_bits) assert torch.equal(packed_values, expected_values) assert packed_values.dtype == expected_values.dtype @pytest.mark.parametrize( "num_bits,values,expected_tensor", [ ( 4, torch.tensor([[9]], dtype=torch.int32), torch.tensor([[1]], dtype=torch.int8), ), ( 8, torch.tensor([[129]], dtype=torch.int32), torch.tensor([[1]], dtype=torch.int8), ), ( 4, torch.tensor([[52137]], dtype=torch.int32), torch.tensor([[1, 2, 3, 4]], dtype=torch.int8), ), ( 4, torch.tensor([[1985229328]], dtype=torch.int32), torch.tensor([[-8, -7, -6, -5, -4, -3, -2, -1]], dtype=torch.int8), ), ( 8, torch.tensor([[-2071756159]], dtype=torch.int32), torch.tensor([[1, 2, 3, 4]], dtype=torch.int8), ), ( 8, torch.tensor([[50462976]], dtype=torch.int32), torch.tensor([[-128, -127, -126, -125]], dtype=torch.int8), ), ( 4, torch.tensor([[1985229328, 52137]], dtype=torch.int32), torch.tensor( [[-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4]], dtype=torch.int8 ), ), ( 4, torch.tensor([[1985229328, 52137], [52137, 1985229328]], dtype=torch.int32), torch.tensor( [ [-8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, -8, -8, -8, -8], [1, 2, 3, 4, -8, -8, -8, -8, -8, -7, -6, -5, -4, -3, -2, -1], ], dtype=torch.int8, ), ), ( 8, torch.tensor([[-2071756159], [50462976]], dtype=torch.int32), torch.tensor( [ [1, 2, 3, 4], [-128, -127, -126, -125], ], dtype=torch.int8, ), ), ( 8, torch.tensor( [[-2071756159, 50462976], [50462976, -2071756159]], dtype=torch.int32 ), torch.tensor( [ [1, 2, 3, 4, -128, -127, -126, -125], [-128, -127, -126, -125, 1, 2, 3, 4], ], dtype=torch.int8, ), ), ], ) def test_unpack_from_int32(num_bits, values, expected_tensor): unpacked_tensor = unpack_from_int32(values, num_bits, expected_tensor.shape) assert torch.equal(unpacked_tensor, unpacked_tensor) assert unpacked_tensor.dtype == unpacked_tensor.dtype compressed-tensors-0.9.4/tests/test_compressors/sparse_compressors/000077500000000000000000000000001500222531600260735ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_compressors/sparse_compressors/__init__.py000066400000000000000000000011511500222531600302020ustar00rootroot00000000000000# Copyright (c) 2021 - present / 
Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_compressors/sparse_compressors/test_bitmask.py000066400000000000000000000103061500222531600311360ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import shutil import pytest import torch from compressed_tensors import BitmaskCompressor, BitmaskConfig, BitmaskTensor from safetensors.torch import save_file @pytest.mark.parametrize( "shape,sparsity,dtype", [ [(512, 1024), 0.5, torch.float32], [(830, 545), 0.8, torch.float32], [(342, 512), 0.3, torch.bfloat16], [(256, 700), 0.9, torch.float16], ], ) def test_bitmask_sizes(shape, sparsity, dtype): test_tensor = torch.rand(shape, dtype=dtype) mask = (test_tensor.abs() < (1 - sparsity)).int() test_tensor *= mask dense_state_dict = {"dummy.weight": test_tensor} sparsity_config = BitmaskConfig() compressor = BitmaskCompressor(config=sparsity_config) sparse_state_dict = compressor.compress(dense_state_dict) # each dense tensor has 4 parameters for compression assert len(dense_state_dict) * 4 == len(sparse_state_dict) # bitmask should be 1 bit per dense element, rounded up to nearest int8 sparse_shape = sparse_state_dict["dummy.shape"] assert torch.all(torch.eq(sparse_shape, torch.tensor(shape))) bitmask_shape = sparse_state_dict["dummy.bitmask"].shape assert bitmask_shape[0] == sparse_shape[0] assert bitmask_shape[1] == int(math.ceil(sparse_shape[1] / 8.0)) # one value for each non-zero weight values_shape = sparse_state_dict["dummy.compressed"].shape assert values_shape[0] == torch.sum(test_tensor != 0) row_offsets_shape = sparse_state_dict["dummy.row_offsets"].shape assert row_offsets_shape[0] == test_tensor.shape[0] @pytest.mark.parametrize( "shape,sparsity,dtype", [ [(256, 512), 0.5, torch.float32], [(128, 280), 0.8, torch.float32], [(1024, 256), 0.3, torch.bfloat16], [(511, 350), 0.7, torch.float16], ], ) def test_match(shape, sparsity, dtype): test_tensor1 = torch.rand(shape, dtype=dtype) mask = (test_tensor1.abs() < (1 - sparsity)).int() test_tensor1 *= mask test_tensor2 = torch.rand(shape, dtype=dtype) mask = (test_tensor2.abs() < (1 - sparsity)).int() test_tensor2 *= mask dense_state_dict = {"dummy.weight": test_tensor1, "dummy2.weight": test_tensor2} for key in dense_state_dict.keys(): dense_tensor = dense_state_dict[key] sparse_tensor = BitmaskTensor.from_dense(dense_tensor) decompressed = sparse_tensor.decompress() assert 
decompressed.dtype == dense_tensor.dtype == dtype assert torch.equal(dense_tensor, decompressed) @pytest.mark.parametrize( "sparsity,dtype", [ [0.5, torch.float32], [0.8, torch.float32], [0.3, torch.bfloat16], [0.7, torch.float16], ], ) def test_reload_match(sparsity, dtype, tmp_path): test_tensor1 = torch.rand((256, 512), dtype=dtype) mask = (test_tensor1.abs() < (1 - sparsity)).int() test_tensor1 *= mask test_tensor2 = torch.rand((360, 720), dtype=dtype) mask = (test_tensor2.abs() < (1 - sparsity)).int() test_tensor2 *= mask dense_state_dict = {"dummy.weight": test_tensor1, "dummy2.weight": test_tensor2} sparsity_config = BitmaskConfig() compressor = BitmaskCompressor(config=sparsity_config) sparse_state_dict = compressor.compress(dense_state_dict) save_file(sparse_state_dict, tmp_path / "model.safetensors") reconstructed_dense = compressor.decompress(tmp_path) for key, reconstructed_tensor in reconstructed_dense: dense_tensor = dense_state_dict[key] assert dense_tensor.dtype == reconstructed_tensor.dtype == dtype assert torch.equal(dense_tensor, reconstructed_tensor) shutil.rmtree(tmp_path) compressed-tensors-0.9.4/tests/test_compressors/sparse_compressors/test_sparse_24_bitmask.py000066400000000000000000000143241500222531600330240ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
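# Illustrative sketch (not part of the original test suite): the tests in this
# file build 2:4 semi-structured weights via
# `tests.testing_utils.generate_pruned_semi_structured_mat`. The helper below
# shows, with plain torch ops, what a 2:4 pattern means: within every group of
# four contiguous values along a row, only two may be non-zero. The function
# name `_example_prune_2_of_4` and its keep-largest-magnitude policy are
# assumptions for demonstration only, not the repository's helper.
import torch


def _example_prune_2_of_4(dense: torch.Tensor) -> torch.Tensor:
    """Zero out two of every four contiguous values per row (2:4 sparsity)."""
    rows, cols = dense.shape
    assert cols % 4 == 0, "2:4 pruning expects the row length to be a multiple of 4"
    groups = dense.reshape(rows, cols // 4, 4)
    # rank the four entries of each group by magnitude and keep the top two
    order = groups.abs().argsort(dim=-1, descending=True)
    ranks = order.argsort(dim=-1)
    mask = ranks < 2
    return (groups * mask).reshape(rows, cols)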
import pytest import torch from compressed_tensors import Sparse24BitMaskTensor from compressed_tensors.quantization import FP8_DTYPE from compressed_tensors.utils import combine_shards, shard_tensor from tests.testing_utils import generate_pruned_semi_structured_mat, requires_gpu @pytest.fixture def dense_matrix_fixture(): def _generate_dense_matrix(M, K, dtype): return generate_pruned_semi_structured_mat(M, K, dtype) return _generate_dense_matrix @pytest.fixture def shard_validation(): def _validate_shard_shapes(sharded_values, sharded_bitmask, expected_shapes): for shard_values, shard_bitmask, expected_shape in zip( sharded_values, sharded_bitmask, expected_shapes ): assert ( shard_values.shape == expected_shape["compressed"] ), f"Shape mismatch: {shard_values.shape} != {expected_shape['compressed']}" assert ( shard_bitmask.shape == expected_shape["bitmask"] ), f"Shape mismatch: {shard_bitmask.shape} != {expected_shape['bitmask']}" return _validate_shard_shapes def validate_compression(dense_matrix, decompressed_tensor): """Validate that the decompressed tensor matches the original dense matrix.""" dense_matrix = dense_matrix.to(decompressed_tensor.device) assert dense_matrix.dtype == decompressed_tensor.dtype, "Dtype mismatch" assert dense_matrix.shape == decompressed_tensor.shape, "Shape mismatch" assert torch.equal(dense_matrix, decompressed_tensor), "Decompression failed" @pytest.mark.parametrize("dtype", [torch.int8]) def test_bitmask_compress_decompress(dense_matrix_fixture, dtype): M, K = 1024, 1024 dense_matrix = dense_matrix_fixture(M, K, dtype) bitmask_tensor = Sparse24BitMaskTensor.from_dense( dense_matrix, sparsity_structure="2:4" ) decompressed_tensor = bitmask_tensor.decompress() validate_compression(dense_matrix, decompressed_tensor) @pytest.mark.parametrize( "dtype, M, K, shard_sizes, shard_dim, expected_shapes", [ ( torch.int8, 2560, 2048, [2048, 256, 256], 0, [ {"compressed": (2048, 1024), "bitmask": (2048, 2048 // 8)}, {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)}, {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)}, ], ), ( torch.int8, 2048, 2048, [1024, 1024], 1, [ {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)}, {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)}, ], ), ], ) def test_bitmask_compress_decompress_sharded( dense_matrix_fixture, shard_validation, dtype, M, K, shard_sizes, shard_dim, expected_shapes, ): dense_matrix = dense_matrix_fixture(M, K, dtype) bitmask_tensor = Sparse24BitMaskTensor.from_dense(dense_matrix) compressed_values = bitmask_tensor.compressed compressed_bitmask = bitmask_tensor.bitmask if shard_dim == 1: compressed_shard_sizes = [size // 2 for size in shard_sizes] bitmask_shard_sizes = [size // 8 for size in shard_sizes] else: compressed_shard_sizes = shard_sizes bitmask_shard_sizes = shard_sizes sharded_compressed_values = shard_tensor( compressed_values, compressed_shard_sizes, dim=shard_dim ) sharded_compressed_bitmask = shard_tensor( compressed_bitmask, bitmask_shard_sizes, dim=shard_dim ) shard_validation( sharded_compressed_values, sharded_compressed_bitmask, expected_shapes ) decompressed_shards = [ Sparse24BitMaskTensor( shape=(expected_shape["bitmask"][0], expected_shape["bitmask"][1] * 8), compressed=shard_values, bitmask=shard_bitmask, ).decompress() for shard_values, shard_bitmask, expected_shape in zip( sharded_compressed_values, sharded_compressed_bitmask, expected_shapes ) ] decompressed_combined = combine_shards(decompressed_shards, dim=shard_dim) 
validate_compression(dense_matrix, decompressed_combined) # GPU-Specific Tests for FP8_DTYPE @pytest.mark.parametrize("dtype", [FP8_DTYPE]) @requires_gpu def test_bitmask_compress_decompress_fp8(dense_matrix_fixture, dtype): test_bitmask_compress_decompress(dense_matrix_fixture, dtype) @pytest.mark.parametrize( "dtype, M, K, shard_sizes, shard_dim, expected_shapes", [ ( FP8_DTYPE, 2560, 2048, [2048, 256, 256], 0, [ {"compressed": (2048, 1024), "bitmask": (2048, 2048 // 8)}, {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)}, {"compressed": (256, 1024), "bitmask": (256, 2048 // 8)}, ], ), ( FP8_DTYPE, 2048, 2048, [1024, 1024], 1, [ {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)}, {"compressed": (2048, 512), "bitmask": (2048, 2048 // 8 // 2)}, ], ), ], ) @requires_gpu def test_bitmask_compress_decompress_sharded_fp8( dense_matrix_fixture, shard_validation, dtype, M, K, shard_sizes, shard_dim, expected_shapes, ): test_bitmask_compress_decompress_sharded( dense_matrix_fixture, shard_validation, dtype, M, K, shard_sizes, shard_dim, expected_shapes, ) compressed-tensors-0.9.4/tests/test_compressors/sparse_quantized_compressors/000077500000000000000000000000001500222531600301575ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_compressors/sparse_quantized_compressors/__init__.py000066400000000000000000000011511500222531600322660ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_compressors/sparse_quantized_compressors/test_marlin_24.py000066400000000000000000000101311500222531600333530ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
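# Illustrative sketch (not part of the original test suite): the sharded
# bitmask tests in the preceding file split compressed tensors with
# `shard_tensor` and stitch the decompressed pieces back together with
# `combine_shards`. Conceptually that is a split/concatenate round trip along
# one dimension; the plain-torch stand-in below shows the idea without relying
# on the library utilities. Shapes and shard sizes here are arbitrary examples.
import torch


def _example_shard_roundtrip() -> None:
    full = torch.arange(24).reshape(4, 6)
    shard_sizes = [2, 2]  # two shards of two rows each
    shards = torch.split(full, shard_sizes, dim=0)
    recombined = torch.cat(shards, dim=0)
    assert torch.equal(full, recombined)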
from collections import OrderedDict import pytest import torch from compressed_tensors.compressors import ( BaseCompressor, Marlin24Compressor, map_modules_to_quant_args, ) from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStatus, QuantizationStrategy, apply_quantization_config, ) from compressed_tensors.utils import mask_creator, merge_names from torch.nn.modules import Linear, Sequential def get_2_4_quant_config(num_bits, strategy, ignore): gs = 128 if strategy is QuantizationStrategy.GROUP else None weights = QuantizationArgs(num_bits=num_bits, strategy=strategy, group_size=gs) scheme = QuantizationScheme(weights=weights, targets=["Linear"]) config = QuantizationConfig(config_groups={"group_0": scheme}, ignore=ignore) return config def test_marlin_registered(): config_name = CompressionFormat.marlin_24.value compressor = BaseCompressor.load_from_registry(config_name) assert isinstance(compressor, Marlin24Compressor) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize( "strategy", [QuantizationStrategy.GROUP, QuantizationStrategy.CHANNEL] ) @pytest.mark.parametrize("layer_shape", [(512, 128), (1024, 1024), (128, 256)]) def test_marlin24_format( mock_per_group_calibration, mock_per_channel_calibration, num_bits, strategy, layer_shape, ): QUANT_NAME = "quant" NOT_QUANT_NAME = "not_quant" model = Sequential( OrderedDict( [ (QUANT_NAME, Linear(layer_shape[0], layer_shape[1], bias=False)), (NOT_QUANT_NAME, Linear(layer_shape[1], 64, bias=False)), ] ) ) config = get_2_4_quant_config(num_bits, strategy, ignore=[NOT_QUANT_NAME]) mask = mask_creator(model.quant.weight.data).bool() model.quant.weight.data *= mask apply_quantization_config(model, config) model.quantization_status = QuantizationStatus.CALIBRATION # runs observer to get scale and zero point if strategy == QuantizationStrategy.GROUP: mock_per_group_calibration( model.quant, base_name="weight", value=model.quant.weight, group_size=128 ) if strategy == QuantizationStrategy.CHANNEL: mock_per_channel_calibration( model.quant, base_name="weight", value=model.quant.weight ) state_dict = model.state_dict() assert len(state_dict) == 4 assert f"{NOT_QUANT_NAME}.weight_scale" not in state_dict assert f"{QUANT_NAME}.weight_scale" in state_dict model_to_quant_args = map_modules_to_quant_args(model) compressor = Marlin24Compressor() compressor.validate_quant_compatability(model_to_quant_args) compressor.validate_sparsity_structure( QUANT_NAME, state_dict[f"{QUANT_NAME}.weight"] ) with pytest.raises(ValueError): compressor.validate_sparsity_structure( NOT_QUANT_NAME, state_dict[f"{NOT_QUANT_NAME}.weight"] ) compressor = Marlin24Compressor() compressed_state_dict = compressor.compress(state_dict, model_to_quant_args) assert len(compressed_state_dict) == 4 assert torch.equal( state_dict[f"{NOT_QUANT_NAME}.weight"], compressed_state_dict[f"{NOT_QUANT_NAME}.weight"], ) for param_name in compressor.compression_param_names: full_param_name = merge_names(QUANT_NAME, param_name) assert full_param_name in compressed_state_dict compressed-tensors-0.9.4/tests/test_configs/000077500000000000000000000000001500222531600212105ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_configs/__init__.py000066400000000000000000000011511500222531600233170ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_configs/test_base.py000066400000000000000000000043151500222531600235360ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest from compressed_tensors.config import SparsityStructure def test_sparsity_structure_valid_cases(): assert ( SparsityStructure("2:4") == SparsityStructure.TWO_FOUR ), "Failed to match '2:4' with TWO_FOUR" assert ( SparsityStructure("unstructured") == SparsityStructure.UNSTRUCTURED ), "Failed to match 'unstructured' with UNSTRUCTURED" assert ( SparsityStructure("UNSTRUCTURED") == SparsityStructure.UNSTRUCTURED ), "Failed to match 'UNSTRUCTURED' with UNSTRUCTURED" assert ( SparsityStructure(None) == SparsityStructure.UNSTRUCTURED ), "Failed to match None with UNSTRUCTURED" def test_sparsity_structure_invalid_case(): with pytest.raises(ValueError, match="invalid is not a valid SparsityStructure"): SparsityStructure("invalid") def test_sparsity_structure_case_insensitivity(): assert ( SparsityStructure("2:4") == SparsityStructure.TWO_FOUR ), "Failed to match '2:4' with TWO_FOUR" assert ( SparsityStructure("2:4".upper()) == SparsityStructure.TWO_FOUR ), "Failed to match '2:4'.upper() with TWO_FOUR" assert ( SparsityStructure("unstructured".upper()) == SparsityStructure.UNSTRUCTURED ), "Failed to match 'unstructured'.upper() with UNSTRUCTURED" assert ( SparsityStructure("UNSTRUCTURED".lower()) == SparsityStructure.UNSTRUCTURED ), "Failed to match 'UNSTRUCTURED'.lower() with UNSTRUCTURED" def test_sparsity_structure_default_case(): assert ( SparsityStructure(None) == SparsityStructure.UNSTRUCTURED ), "Failed to match None with UNSTRUCTURED" compressed-tensors-0.9.4/tests/test_examples/000077500000000000000000000000001500222531600213765ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_examples/test_bitmask_compression_ipynb.py000066400000000000000000000023341500222531600302650ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import nbformat import pytest from nbconvert.preprocessors import ExecutePreprocessor @pytest.mark.skip( reason="GHA not setup yet to run those tests. The test should work locally" ) @pytest.mark.parametrize("notebook", ["examples/bitmask_compression.ipynb"]) def test_notebook_exec(notebook): with open(notebook) as f: nb = nbformat.read(f, as_version=4) ep = ExecutePreprocessor(timeout=600, kernel_name="python3") try: assert ep.preprocess(nb) is not None, f"Got empty notebook for {notebook}" except Exception: assert False, f"Failed executing {notebook}" compressed-tensors-0.9.4/tests/test_linear/000077500000000000000000000000001500222531600210325ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_linear/__init__.py000066400000000000000000000011511500222531600231410ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_linear/test_compressed_linear.py000066400000000000000000000067261500222531600261540ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest import torch from compressed_tensors.linear.compressed_linear import CompressedLinear from transformers import AutoModelForCausalLM, AutoTokenizer def models_with_linear_quantized(): return [ # weights packed "nm-testing/llama2.c-stories110M-gsm8k-recipe_w4a16_actorder_weight-compressed", # weights not packed "nm-testing/llama2.c-stories110M-gsm8k-fp8_dynamic-compressed", ] @pytest.mark.parametrize("model_stub", models_with_linear_quantized()) def test_model_forward_pass(model_stub): """ Test that AutoModelForCausalLM can process tokenized inputs and generate output. 
""" # Load model model = AutoModelForCausalLM.from_pretrained( model_stub, torch_dtype=torch.float16, device_map="auto" ) # Load tokenizer tokenizer = AutoTokenizer.from_pretrained(model_stub) # Define sample input sample_inputs = [ "I love quantization because", "What is the capital of France?", "def fibonacci(n):", ] # Move inputs to the correct device device = next(model.parameters()).device inputs = tokenizer(sample_inputs, return_tensors="pt", padding=True).to(device) # Run model inference (forward pass) outputs = model.generate(**inputs, max_length=50) # Ensure output is not empty assert outputs is not None, "Model forward pass failed, no output generated." @pytest.mark.parametrize("model_stub", models_with_linear_quantized()) def test_compressed_linear_from_linear_usage(monkeypatch, model_stub): """ Test that CompressedLinear.from_linear is used for creating CompressedLinear instances. """ call_count = 0 original_from_linear = CompressedLinear.from_linear def fake_from_linear(*args, **kwargs): nonlocal call_count call_count += 1 return original_from_linear(*args, **kwargs) # Replace the original from_linear with our fake to count its invocations monkeypatch.setattr(CompressedLinear, "from_linear", fake_from_linear) # Load model to trigger the creation of CompressedLinear instances model = AutoModelForCausalLM.from_pretrained( model_stub, torch_dtype="auto", device_map="auto" ) # Known quantized layers that should be # instances of CompressedLinear # (This is not an exhaustive list) quantized_layers = {"q_proj", "k_proj", "v_proj"} # Check that the expected layers are instances of CompressedLinear for layer_name, module in model.named_modules(): if any(layer in layer_name for layer in quantized_layers): assert isinstance( module, CompressedLinear ), f"{layer_name} should be an instance of CompressedLinear" f"but got {type(module).__name__}" assert call_count > 0, "`CompressedLinear.from_linear` was not used during the " "creation of CompressedLinear instances." compressed-tensors-0.9.4/tests/test_quantization/000077500000000000000000000000001500222531600223065ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_quantization/__init__.py000066400000000000000000000011511500222531600244150ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_quantization/lifecycle/000077500000000000000000000000001500222531600242455ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_quantization/lifecycle/__init__.py000066400000000000000000000011511500222531600263540ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_quantization/lifecycle/conftest.py000066400000000000000000000031041500222531600264420ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from typing import List, Optional import pytest import torch from compressed_tensors.quantization.quant_args import QuantizationArgs from compressed_tensors.quantization.quant_config import QuantizationStatus from compressed_tensors.quantization.quant_scheme import QuantizationScheme @pytest.fixture def create_quantization_scheme(): def quantization_scheme( targets: List[str], weights: Optional[QuantizationArgs] = None, input_activations: Optional[QuantizationArgs] = None, output_activations: Optional[QuantizationArgs] = None, ): return QuantizationScheme( targets=targets, weights=weights, input_activations=input_activations, output_activations=output_activations, ) return quantization_scheme @pytest.fixture def mock_frozen(): def update_status(model: torch.nn.Module): model.quantization_status = QuantizationStatus.FROZEN return update_status compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_apply.py000066400000000000000000000305271500222531600270120ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
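# Illustrative sketch (not part of the original test suite): the
# `create_quantization_scheme` fixture above is a thin factory around
# `QuantizationScheme`. For reference, an 8-bit scheme targeting Linear layers
# with symmetric weights and asymmetric input activations can be built the
# same way; the helper name and the particular values below are examples only.
from compressed_tensors.quantization.quant_args import QuantizationArgs
from compressed_tensors.quantization.quant_scheme import QuantizationScheme


def _example_w8a8_scheme() -> QuantizationScheme:
    return QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(num_bits=8, symmetric=True),
        input_activations=QuantizationArgs(num_bits=8, symmetric=False),
    )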
import re from collections import defaultdict from typing import Optional from unittest.mock import MagicMock import pytest import torch from compressed_tensors.config import CompressionFormat from compressed_tensors.quantization import ( DEFAULT_QUANTIZATION_METHOD, QuantizationConfig, QuantizationStatus, ) from compressed_tensors.quantization.lifecycle import ( apply_quantization_config, apply_quantization_status, expand_target_names, is_target, ) from compressed_tensors.quantization.utils import iter_named_leaf_modules from tests.testing_utils import requires_accelerate from transformers import AutoModelForCausalLM @pytest.fixture def mock_model(): model = MagicMock() model.named_modules.return_value = [ ("layer1", MagicMock()), ("layer2", MagicMock()), ("layer3", MagicMock()), ] return model @pytest.fixture def mock_module(): return MagicMock() @pytest.fixture def llama_stories_model(): return AutoModelForCausalLM.from_pretrained( "Xenova/llama2.c-stories15M", torch_dtype="auto", ) def test_target_prioritization(mock_frozen): # tests that the config_groups are applied in the correct order # of priority, where exact layer name > regex > module name config = { "quant_method": "compressed-tensors", "format": "fakequant", "config_groups": { "group_1": { "weights": { "num_bits": 8, }, "targets": ["Linear"], }, "group_2": { "weights": { "num_bits": 4, }, "targets": ["re:.*down_proj"], }, "group_3": { "weights": { "num_bits": 2, }, "targets": ["model.layers.0.mlp.down_proj"], }, }, } model = AutoModelForCausalLM.from_pretrained( "HuggingFaceM4/tiny-random-LlamaForCausalLM", torch_dtype="auto" ) model.eval() config = QuantizationConfig(**config) config.quantization_status = QuantizationStatus.CALIBRATION apply_quantization_config(model, config) mock_frozen(model) for name, module in iter_named_leaf_modules(model): if name == "model.layers.0.mlp.down_proj": assert module.quantization_scheme.weights.num_bits == 2 elif re.match(".*down_proj", name): assert module.quantization_scheme.weights.num_bits == 4 elif isinstance(module, torch.nn.Linear): assert module.quantization_scheme.weights.num_bits == 8 def test_apply_quantization_config_tinyllama(): quant_config = get_sample_tinyllama_quant_config(status="calibration") model = get_tinyllama_model() # check that model is not already quantized for module in model.modules(): _test_layer_quantization_status(module, inputs=False, weights=False) count_layer_names = ("Linear", "Embeddidng", "LlamaRotaryEmbedding") count_layer_num = defaultdict(int) for name, module in model.named_modules(): if name in quant_config.ignore: continue module_type = module.__class__.__name__ if module_type in count_layer_names: count_layer_num[module_type] += 1 assert len(count_layer_num) > 0, f"None of {count_layer_names} found in model" assert all(value > 0 for value in count_layer_num.values()) # apply quant config to model apply_quantization_config(model, quant_config) # check for correct application of quant config for name, module in model.named_modules(): if name in quant_config.ignore: continue module_type = module.__class__.__name__ if module_type in count_layer_names: count_layer_num[module_type] -= 1 _inputs = module_type == "Linear" _weights = not module_type == "LlamaRotaryEmbedding" _test_layer_quantization_status(module, inputs=_inputs, weights=_weights) assert all( value == 0 for value in count_layer_num.values() ), "Not all values are zero" # test quantization compression # sample forward pass to fill scales, zps model(torch.zeros((1, 1), dtype=int), 
torch.zeros((1, 1), dtype=int)) apply_quantization_status(model, QuantizationStatus.COMPRESSED) for name, module in model.named_modules(): if name in quant_config.ignore: continue module_type = module.__class__.__name__ if module_type == "Linear": _test_layer_quantization_status( module, inputs=True, weights=True, expected_status=QuantizationStatus.COMPRESSED, expected_dtype=torch.int8, ) def test_serialize_config_tinyllama(): quant_config = get_sample_tinyllama_quant_config() model = get_tinyllama_model() # check that model is not already quantized for module in model.modules(): _test_layer_quantization_status(module, inputs=False, weights=False) # apply quant config to model apply_quantization_config(model, quant_config) serialized_config = QuantizationConfig.from_pretrained(model) assert len(serialized_config.config_groups) == 2 assert serialized_config.config_groups["group_0"].targets == ["Embedding"] assert serialized_config.config_groups["group_0"].input_activations is None assert serialized_config.config_groups["group_1"].targets == ["Linear"] assert serialized_config.config_groups["group_1"].input_activations is not None assert serialized_config.format == CompressionFormat.dense.value assert serialized_config.quant_method == DEFAULT_QUANTIZATION_METHOD assert serialized_config.ignore == ["model.layers.1.mlp.down_proj"] if serialized_config.global_compression_ratio is not None: assert serialized_config.global_compression_ratio > 1.0 assert serialized_config.global_compression_ratio < 8.0 def _test_layer_quantization_status( module, inputs: bool, weights: bool, expected_status: Optional[QuantizationStatus] = None, expected_dtype: Optional[torch.dtype] = None, ): # check if quantization is applied at all (true if inputs or weights targeted) quantized = inputs or weights assert hasattr(module, "quantization_scheme") == quantized assert hasattr(module, "quantization_status") == quantized if expected_status is not None: assert module.quantization_status is expected_status # check inputs matches expected assert hasattr(module, "input_scale") == inputs assert hasattr(module, "input_zero_point") == inputs # check weights matches expected assert hasattr(module, "weight_scale") == weights assert hasattr(module, "weight_zero_point") == weights if weights and expected_dtype is not None: assert module.weight.dtype is expected_dtype def get_tinyllama_model(): return AutoModelForCausalLM.from_pretrained( "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", torch_dtype="auto", ) def get_sample_tinyllama_quant_config(status: str = "frozen"): config_dict = { "quant_method": "compressed-tensors", "format": "fakequant", "quantization_status": status, "global_compression_ratio": None, "config_groups": { "group_1": { "weights": { "num_bits": 8, "type": "int", "symmetric": True, "strategy": "tensor", }, "input_activations": { "num_bits": 8, "type": "int", "symmetric": True, "strategy": "tensor", }, "targets": ["Linear"], }, "group_2": { "weights": { "num_bits": 8, "type": "int", "symmetric": False, "strategy": "tensor", }, "input_activations": None, "targets": ["Embedding"], }, }, "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], } return QuantizationConfig.model_validate(config_dict) @requires_accelerate() @pytest.mark.parametrize( "ignore,should_raise_warning", [ [("lm_head", "re:.*gate"), False], [("lm_head", "re:.*foobarbaz"), True], ], ) def test_apply_quantization_status(caplog, ignore, should_raise_warning): import logging # load a dense, unquantized tiny llama model model = 
get_tinyllama_model() quantization_config_dict = { "quant_method": "sparseml", "format": "pack-quantized", "global_compression_ratio": None, "config_groups": { "group_1": { "weights": { "num_bits": 4, "type": "int", "symmetric": False, "strategy": "tensor", }, "targets": ["Linear"], } }, } quantization_config_dict["ignore"] = ignore config = QuantizationConfig(**quantization_config_dict) config.quantization_status = QuantizationStatus.CALIBRATION # mismatch in the ignore key of quantization_config_dict with caplog.at_level(logging.WARNING): apply_quantization_config(model, config) if should_raise_warning: assert len(caplog.text) > 0 else: assert len(caplog.text) == 0 @pytest.mark.parametrize( "targets, ignore, expected_targets", [ ([], [], set()), (["layer1", "layer2"], [], {"layer1", "layer2"}), ([], ["layer1"], set()), (["layer1", "layer2"], ["layer2"], {"layer1"}), (["re:layer.*"], ["layer3"], {"layer1", "layer2"}), ], ) def test_expand_targets_with_mock(mock_model, targets, ignore, expected_targets): expanded_targets = expand_target_names(mock_model, targets, ignore) assert expanded_targets == expected_targets @pytest.mark.parametrize( "targets, ignore, expected_targets", [ ( ["re:model.layers.[01].self_attn.q_proj"], ["re:model.layers.1.self_attn.q_proj"], set(["model.layers.0.self_attn.q_proj"]), ), ( ["re:model.layers.[01].self_attn.q_proj"], [], set(["model.layers.0.self_attn.q_proj", "model.layers.1.self_attn.q_proj"]), ), ( ["re:model.layers.[0-2].self_attn.q_proj"], ["re:model.layers.1.self_attn.q_proj"], set(["model.layers.0.self_attn.q_proj", "model.layers.2.self_attn.q_proj"]), ), ( ["model.layers.0.self_attn.q_proj"], ["model.layers.0.self_attn.q_proj"], set(), ), ( ["re:model.layers.*.self_attn.q_proj"], ["re:model.layers.[01].self_attn.q_proj"], set( f"model.layers.{layer_idx}.self_attn.q_proj" for layer_idx in range(2, 6) ), ), ], ) def test_expand_targets_with_llama_stories( llama_stories_model, targets, ignore, expected_targets ): expanded_targets = expand_target_names(llama_stories_model, targets, ignore) assert expanded_targets == expected_targets @pytest.mark.parametrize( "name, targets, ignore, expected", [ ("layer1", ["layer1"], [], True), ("layer1", ["layer1"], ["layer1"], False), ("layer1", ["layer2"], [], False), ("layer1", ["re:layer.*"], [], True), ("layer1", ["re:layer.*"], ["re:layer1"], False), ], ) def test_is_target_with_mock(mock_module, name, targets, ignore, expected): result = is_target(name, mock_module, targets, ignore) assert result == expected compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_dynamic_lifecycle.py000066400000000000000000000103321500222531600313200ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
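# Illustrative sketch (not part of the original test suite): the
# `expand_target_names` tests in the preceding file resolve target/ignore
# patterns (exact module names or "re:" regexes) into the set of concrete
# module names they match. On a tiny made-up model the behaviour should look
# roughly like this; the model layout and the expected result are assumptions
# for demonstration.
from collections import OrderedDict

from compressed_tensors.quantization.lifecycle import expand_target_names
from torch.nn import Linear, Sequential


def _example_expand_targets() -> None:
    model = Sequential(OrderedDict([("fc1", Linear(4, 4)), ("fc2", Linear(4, 4))]))
    matched = expand_target_names(model, ["re:fc.*"], ["fc2"])
    # the regex matches both layers, the ignore list removes fc2
    assert matched == {"fc1"}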
import torch from compressed_tensors.quantization.lifecycle import apply_quantization_config from compressed_tensors.quantization.quant_config import QuantizationConfig from transformers import AutoModelForCausalLM def test_apply_tinyllama_dynamic_activations(): # NOTE: should not calibrate dynamic quant quant_config = get_sample_dynamic_tinyllama_quant_config() model = get_tinyllama_model() # check that model is not already quantized for module in model.modules(): _test_layer_dynamic_quantization_status(module, inputs=False, weights=False) # apply quant config to model apply_quantization_config(model, quant_config) # test linears are dynamically quantized for calibration _test_linears_dynamic_quantization_status(model, quant_config, frozen=False) # verify forward works w/ dynamic during calibration model(torch.zeros((1, 1), dtype=int), torch.zeros((1, 1), dtype=int)) _test_linears_dynamic_quantization_status(model, quant_config, frozen=True) # verify forward works w/ dynamic after freeze model(torch.zeros((1, 1), dtype=int), torch.zeros((1, 1), dtype=int)) def _test_linears_dynamic_quantization_status(model, quant_config, frozen: bool): # check for correct application of quant config num_linears = 0 for name, module in model.named_modules(): if name in quant_config.ignore: continue module_type = module.__class__.__name__ if module_type == "Linear": num_linears += 1 _test_layer_dynamic_quantization_status( module, inputs=True, weights=True, frozen=frozen ) # sanity check correct number of layers targeted assert num_linears == 154 # 155 Linear layers - 1 that gets ignored def _test_layer_dynamic_quantization_status( module, inputs: bool, weights: bool, frozen: bool = False ): # check if quantization is applied at all (true if inputs or weights targeted) quantized = inputs or weights assert hasattr(module, "quantization_scheme") == quantized assert hasattr(module, "quantization_status") == quantized # check inputs always have an observer if quantized but never scale/zp assert not hasattr(module, "input_scale") assert not hasattr(module, "input_zero_point") assert not hasattr(module, "input_observer") # check weights always have scale/zp and observer only if not frozen assert hasattr(module, "weight_scale") == weights assert hasattr(module, "weight_zero_point") == weights def get_tinyllama_model(): return AutoModelForCausalLM.from_pretrained( "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", torch_dtype="auto", ) def get_sample_dynamic_tinyllama_quant_config(): config_dict = { "quant_method": "compressed-tensors", "format": "fakequant", "quantization_status": "calibration", "global_compression_ratio": None, "config_groups": { "group_1": { "weights": { "num_bits": 8, "type": "int", "symmetric": True, "strategy": "tensor", "dynamic": False, }, "input_activations": { "num_bits": 8, "type": "int", "symmetric": True, "strategy": "tensor", "dynamic": True, }, "targets": ["Linear"], }, }, "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"], } return QuantizationConfig.model_validate(config_dict) compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_enabled.py000066400000000000000000000035021500222531600272500ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from copy import deepcopy import torch from compressed_tensors.quantization import ( QuantizationConfig, apply_quantization_config, disable_quantization, enable_quantization, ) from torch.nn import Linear def test_quantization_enabled_disabled(): inp = torch.randn(16) model = Linear(16, 16) quantized_model = deepcopy(model) apply_quantization_config( model=quantized_model, config=QuantizationConfig( config_groups=dict(W8A8=["Linear"]), quantization_status="calibration", ), ) # run one calibration pass quantized_model(inp) model_output = model(inp) quantized_model_output = quantized_model(inp) # quantized and non quantized outputs should be different assert not torch.all(model_output == quantized_model_output) # disable quantization quantized_model.apply(disable_quantization) # check that quantized model now matches model output assert torch.all(model_output == quantized_model(inp)) # re-enable quantization quantized_model.apply(enable_quantization) # check that quantized model matches original quantized output assert torch.all(quantized_model_output == quantized_model(inp)) compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_forward.py000066400000000000000000000143761500222531600273350ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
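# Illustrative sketch (not part of the original test suite): the forward-pass
# tests in this file exercise the library's `quantize`/`dequantize`. The core
# of per-tensor symmetric int8 fake quantization is scale, round, clamp, then
# multiply back out; the simplified stand-in below shows that math only and is
# not the library implementation.
import torch


def _example_fake_quantize_per_tensor(x: torch.Tensor) -> torch.Tensor:
    q_min, q_max = -128, 127  # signed 8-bit range
    # symmetric per-tensor scale with the zero point fixed at 0
    scale = x.abs().max().clamp(min=1e-8) / q_max
    x_q = torch.clamp(torch.round(x / scale), q_min, q_max)
    return x_q * scale  # dequantize back to floating point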
import pytest import torch from compressed_tensors.quantization.lifecycle.forward import ( dequantize, forward_quantize, quantize, wrap_module_forward_quantized, ) from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, ) from compressed_tensors.quantization.quant_args import ( QuantizationArgs, QuantizationStrategy, ) from compressed_tensors.quantization.quant_config import QuantizationStatus from torch.nn import Linear def make_dummy_g_idx(columns: int, group_size: int) -> torch.Tensor: perm = torch.randperm(columns) return torch.tensor([index // group_size for index in range(columns)])[perm] def test_wrap_module_forward_quantized(create_quantization_scheme): num_bits = 8 quantization_scheme = create_quantization_scheme( targets=["*"], weights=QuantizationArgs(num_bits=num_bits, symmetric=True), input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), ) layer = Linear(4, 4) func_forward = layer.forward.__func__ # check that the forward call is overwritten wrap_module_forward_quantized(layer, quantization_scheme) assert not func_forward == layer.forward.__func__ @pytest.mark.parametrize("quantization_status", ["initialized", "calibration"]) def test_forward_quantize( mock_per_tensor_calibration, create_quantization_scheme, quantization_status ): num_bits = 8 quantization_scheme = create_quantization_scheme( targets=["*"], weights=QuantizationArgs(num_bits=num_bits, symmetric=True), input_activations=QuantizationArgs(num_bits=num_bits, symmetric=True), ) quantization_args = QuantizationArgs(num_bits=num_bits, symmetric=True) layer = Linear(4, 4) layer.weight.data *= 100 dummy_tensor = torch.randn(8, 4) # (num_tokens, num_features) layer.quantization_status = QuantizationStatus(quantization_status) # only calibration updates the scale and zero-point if layer.quantization_status == QuantizationStatus.INITIALIZED: # Init zp and scales initialize_module_for_quantization(layer, quantization_scheme) # mock weight calibration mock_per_tensor_calibration(layer, "weight", value=layer.weight.data) # call quant/dequant on weights out = forward_quantize(layer, layer.weight, "weight", quantization_args) assert torch.allclose(out, layer.weight.data, atol=0.2) elif layer.quantization_status == QuantizationStatus.CALIBRATION: # init zp/scales initialize_module_for_quantization(layer, quantization_scheme) # run weight and input calibration mock_per_tensor_calibration(layer, "weight", value=layer.weight.data) mock_per_tensor_calibration(layer, "input", value=dummy_tensor) # call quant/dequant on inputs out = forward_quantize(layer, dummy_tensor, "input", quantization_args) assert torch.allclose(out, dummy_tensor, atol=0.2) @pytest.mark.parametrize( "num_bits,type,strategy,group_size,scale,zero_point,g_idx", [ ( 4, "int", QuantizationStrategy.TENSOR, None, torch.rand((1,)) * 0.01, torch.zeros((1,)), None, ), ( 4, "int", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), None, ), ( 4, "int", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), make_dummy_g_idx(1024, 128), ), ( 8, "float", QuantizationStrategy.TENSOR, None, torch.rand((1,)) * 0.01, torch.zeros((1,)), None, ), ( 8, "float", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), None, ), ( 8, "float", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), make_dummy_g_idx(1024, 128), ), ], ) def test_quantize(num_bits, type, strategy, 
group_size, scale, zero_point, g_idx): args = QuantizationArgs( num_bits=num_bits, type=type, strategy=strategy, group_size=group_size ) x = torch.rand((512, 1024)) quantize( x=x, scale=scale, zero_point=zero_point, args=args, dtype=args.pytorch_dtype(), g_idx=g_idx, ) @pytest.mark.parametrize( "num_bits,type,strategy,group_size,scale,zero_point,g_idx", [ ( 8, "int", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), None, ), ( 8, "int", QuantizationStrategy.GROUP, 128, torch.rand((512, 8, 1)) * 0.01, torch.zeros((512, 8, 1)), make_dummy_g_idx(1024, 128), ), ], ) def test_dequantize(num_bits, type, strategy, group_size, scale, zero_point, g_idx): args = QuantizationArgs( num_bits=num_bits, type=type, strategy=strategy, group_size=group_size ) x_q = torch.rand((512, 1024)).to(dtype=args.pytorch_dtype()) dequantize( x_q=x_q, scale=scale, zero_point=zero_point, args=args, dtype=None, g_idx=g_idx, ) compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_helpers.py000066400000000000000000000033111500222531600273160ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest import torch from compressed_tensors.utils import safe_permute from compressed_tensors.utils.permute import _EXPERIMENTAL_DTYPES @pytest.mark.parametrize( "dtype,device,exp_experimental", [ (torch.int8, torch.device("cpu"), False), (torch.int16, torch.device("cpu"), False), (torch.int32, torch.device("cpu"), False), (torch.int64, torch.device("cpu"), False), (torch.float16, torch.device("cpu"), False), (torch.float32, torch.device("cpu"), False), (torch.float64, torch.device("cpu"), False), (torch.float8_e4m3fn, torch.device("cpu"), True), ], ) def test_safe_permute(dtype: torch.dtype, device: str, exp_experimental: bool): # some dtypes do not support arange initialization tensor = torch.tensor([0, 1, 2, 3], dtype=dtype, device=device) perm = torch.tensor([3, 1, 0, 2]) expected = torch.tensor([3, 1, 0, 2], dtype=dtype, device=device) result = safe_permute(tensor, perm, dim=0) if exp_experimental: assert (dtype, device) in _EXPERIMENTAL_DTYPES assert all(result == expected) compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_initialize.py000066400000000000000000000134731500222531600300270ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
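# Illustrative sketch (not part of the original test suite): the
# initialization tests in this file assert that scale/zero-point shapes follow
# the quantization strategy. The helper below restates that mapping for a 2-D
# weight of shape (out_features, in_features); it mirrors the shapes expected
# by the tests rather than any library internals, and its name is made up.
from typing import Optional, Tuple


def _expected_scale_shape(
    weight_shape: Tuple[int, int],
    strategy: str,
    group_size: Optional[int] = None,
) -> Tuple[int, ...]:
    out_features, in_features = weight_shape
    if strategy == "tensor":
        return (1,)  # a single scale for the whole tensor
    if strategy == "channel":
        return (out_features, 1)  # one scale per output channel
    if strategy == "group":
        return (out_features, max(in_features // group_size, 1))
    raise ValueError(f"unhandled strategy for this sketch: {strategy}")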
import pytest from compressed_tensors.quantization import ( ActivationOrdering, QuantizationArgs, QuantizationScheme, QuantizationStatus, QuantizationStrategy, ) from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, ) from tests.testing_utils import requires_accelerate from torch.nn import Linear NUM_BITS = 8 Q_PARAM_NAMES = { "input_activations": "input", "weights": "weight", "output_activations": "output", } @pytest.fixture def layer(): return Linear(4, 4) @pytest.mark.parametrize( "weights,input_activations", [ ( QuantizationArgs(num_bits=NUM_BITS, symmetric=True), None, ), ( None, QuantizationArgs(num_bits=NUM_BITS, symmetric=True), ), ( QuantizationArgs(num_bits=NUM_BITS, symmetric=True), QuantizationArgs(num_bits=NUM_BITS, symmetric=True), ), ], ) def test_initialize_module_for_quantization( create_quantization_scheme, weights, input_activations, layer ): quantization_scheme = create_quantization_scheme( targets=["*"], weights=weights, input_activations=input_activations, ) assert not hasattr(layer, "quantization_scheme") assert not hasattr(layer, "quantization_status") # add attributes, zero_points and scale initialize_module_for_quantization(layer, quantization_scheme) registered_params = {"weight", "bias"} if weights is not None: registered_params.add("weight_scale") registered_params.add("weight_zero_point") if input_activations is not None: registered_params.add("input_scale") registered_params.add("input_zero_point") for key in layer.state_dict().keys(): assert key in registered_params registered_params.remove(key) assert len(registered_params) == 0 assert hasattr(layer, "quantization_scheme") assert hasattr(layer, "quantization_status") assert layer.quantization_status == QuantizationStatus.INITIALIZED @requires_accelerate() @pytest.mark.parametrize( "weights,input_activations", [ ( QuantizationArgs(num_bits=NUM_BITS, symmetric=True), None, ), ( None, QuantizationArgs(num_bits=NUM_BITS, symmetric=True), ), ( QuantizationArgs(num_bits=NUM_BITS, symmetric=True), QuantizationArgs(num_bits=NUM_BITS, symmetric=True), ), ], ) def test_initialize_module_for_quantization_offloaded( create_quantization_scheme, weights, input_activations, layer ): from accelerate.hooks import attach_align_device_hook attach_align_device_hook(layer, offload=True) test_initialize_module_for_quantization( create_quantization_scheme, weights, input_activations, layer, ) @pytest.mark.parametrize( "weights,input_activations", [ ( QuantizationArgs(strategy="tensor"), QuantizationArgs(strategy="tensor"), ), ( QuantizationArgs(strategy="channel"), None, ), ( QuantizationArgs(strategy="group", group_size=2), None, ), ( QuantizationArgs(strategy="group", group_size=2, actorder="group"), None, ), ( QuantizationArgs(strategy="group", group_size=2, actorder="weight"), None, ), ( QuantizationArgs(strategy="block"), QuantizationArgs(strategy="block"), ), ( QuantizationArgs(strategy="token"), QuantizationArgs(strategy="token"), ), ], ) def test_initialize_quantization_parameters(weights, input_activations): quantization_scheme = QuantizationScheme( targets=["*"], weights=weights, input_activations=input_activations, ) layer = Linear(7, 8) initialize_module_for_quantization(layer, quantization_scheme) for q_type in ("input_activations", "weights"): args = getattr(quantization_scheme, q_type) if args is None: continue q_param_name = Q_PARAM_NAMES[q_type] # scale and zero point if args.strategy == QuantizationStrategy.TENSOR: expected_shape = (1,) elif args.strategy == 
QuantizationStrategy.CHANNEL: # only weight expected_shape = (layer.weight.shape[0], 1) elif args.strategy == QuantizationStrategy.GROUP: # only weight num_groups = layer.weight.shape[1] // args.group_size expected_shape = (layer.weight.shape[0], max(num_groups, 1)) elif args.strategy == QuantizationStrategy.BLOCK: expected_shape = (1,) elif args.strategy == QuantizationStrategy.TOKEN: expected_shape = (1, 1) assert getattr(layer, f"{q_param_name}_scale").shape == expected_shape assert getattr(layer, f"{q_param_name}_zero_point").shape == expected_shape # g_idx if args.actorder == ActivationOrdering.GROUP: assert getattr(layer, f"{q_param_name}_g_idx").shape == ( layer.weight.shape[1], ) compressed-tensors-0.9.4/tests/test_quantization/lifecycle/test_lifecycle.py000066400000000000000000000107711500222531600276230ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from copy import deepcopy import torch from compressed_tensors.quantization.lifecycle.initialize import ( initialize_module_for_quantization, ) from compressed_tensors.quantization.quant_args import QuantizationArgs from compressed_tensors.quantization.quant_config import QuantizationStatus from torch.nn import Linear def test_lifecyle(mock_per_tensor_calibration, create_quantization_scheme): num_bits = 8 quantization_scheme = create_quantization_scheme( input_activations=QuantizationArgs(num_bits=num_bits, symmetric=False), weights=QuantizationArgs(num_bits=num_bits, symmetric=True), targets=["*"], ) layer = Linear(4, 4) layer.weight.data *= 100 # updated layer keys check expected_layer_keys = {"weight", "bias"} for key in layer.state_dict().keys(): expected_layer_keys.remove(key) assert len(expected_layer_keys) == 0 # over write forward pass and register zero_point and scale initialize_module_for_quantization(layer, quantization_scheme) expected_layer_keys = { "input_scale", "input_zero_point", "weight_scale", "weight_zero_point", "weight", "bias", } for key in layer.state_dict().keys(): expected_layer_keys.remove(key) assert len(expected_layer_keys) == 0 assert hasattr(layer, "quantization_scheme") assert hasattr(layer, "quantization_status") assert layer.quantization_status == QuantizationStatus.INITIALIZED assert torch.numel(layer.input_zero_point.data) == 1 assert torch.numel(layer.input_scale) == 1 assert torch.numel(layer.weight_scale) == 1 assert torch.numel(layer.weight_zero_point) == 1 random_input = torch.randn(4, 4) random_input[0][0] = 42 # skew distribution to force non-zero zp # do a calibration step mock_per_tensor_calibration(layer, "weight", value=layer.weight) mock_per_tensor_calibration(layer, "input", value=random_input) # zero-points and scale should be updated after forward pass assert torch.numel(layer.input_zero_point.data) > 0 assert torch.numel(layer.input_scale) > 0 assert torch.numel(layer.weight_scale) > 0 assert torch.numel(layer.weight_zero_point) > 0 # symmetric zero points should center at 0 assert 
layer.weight_zero_point.data == 0 # check high and low bound of the weights assert torch.all(layer.weight.data >= -128) and torch.all(layer.weight.data <= 127) initialized_layer_input_zero_point = deepcopy(layer.input_zero_point) initialized_layer_input_scale = deepcopy(layer.input_scale) initialized_layer_weight_scale = deepcopy(layer.weight_scale) # calibrate the layers with each iteration for _ in range(10): random_input = torch.randn(4, 4) random_input[0][0] = 42 # skew distribution to force non-zero zp mock_per_tensor_calibration(layer, "weight", value=layer.weight) mock_per_tensor_calibration(layer, "input", value=random_input) assert initialized_layer_input_zero_point != 0 assert initialized_layer_input_scale != layer.input_scale assert initialized_layer_weight_scale == layer.weight_scale # check quantization f_q(x) is applied after frozen without update input_check_for_quant = torch.randn(4, 4) out_calibration = layer(input_check_for_quant) layer_before_freeze_input_zero_point = deepcopy(layer.input_zero_point) layer_before_freeze_input_scale = deepcopy(layer.input_scale) layer_before_freeze_weight_scale = deepcopy(layer.weight_scale) for _ in range(10): layer(torch.randn(4, 4)) assert layer_before_freeze_input_zero_point == layer.input_zero_point assert layer_before_freeze_input_scale == layer.input_scale assert layer_before_freeze_weight_scale == layer.weight_scale # check that the same quantization is applied as calibration to frozen assert torch.all(out_calibration == layer(input_check_for_quant)) compressed-tensors-0.9.4/tests/test_quantization/test_configs/000077500000000000000000000000001500222531600247755ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_quantization/test_configs/__init__.py000066400000000000000000000011511500222531600271040ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_quantization/test_configs/test_bit_depths.py000066400000000000000000000140021500222531600305300ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
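# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# initialization step exercised by the lifecycle tests above can be driven
# directly on any torch module. The 8-bit settings below are arbitrary
# example values, not library defaults being asserted here.
# ---------------------------------------------------------------------------
def _example_initialize_linear_for_quantization():
    from torch.nn import Linear

    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationScheme,
        QuantizationStatus,
    )
    from compressed_tensors.quantization.lifecycle.initialize import (
        initialize_module_for_quantization,
    )

    # symmetric int8 weights and asymmetric int8 input activations
    scheme = QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(num_bits=8, symmetric=True),
        input_activations=QuantizationArgs(num_bits=8, symmetric=False),
    )

    layer = Linear(16, 16)
    initialize_module_for_quantization(layer, scheme)

    # scale / zero-point parameters are registered on the module and the
    # lifecycle starts in the INITIALIZED status, as the tests above assert
    assert layer.quantization_status == QuantizationStatus.INITIALIZED
    assert hasattr(layer, "weight_scale") and hasattr(layer, "weight_zero_point")
    assert hasattr(layer, "input_scale") and hasattr(layer, "input_zero_point")
    return layer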
import pytest import torch from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStatus, apply_quantization_config, ) from compressed_tensors.quantization.lifecycle.forward import fake_quantize, quantize from torch.nn import Linear def create_config(bit_depth, quant_type, input_symmetry, weight_symmetry): weights = QuantizationArgs( num_bits=bit_depth, type=quant_type, symmetric=weight_symmetry ) if input_symmetry is not None: inputs = QuantizationArgs( num_bits=bit_depth, type=quant_type, symmetric=input_symmetry ) else: inputs = None config_groups = { "group_1": QuantizationScheme( targets=["Linear"], weights=weights, input_activations=inputs ) } config = QuantizationConfig( config_groups=config_groups, quantization_status=QuantizationStatus.CALIBRATION ) return config @torch.no_grad @pytest.mark.parametrize("bit_depth", [4, 8]) @pytest.mark.parametrize("quant_type", ["int"]) @pytest.mark.parametrize("input_symmetry", [True, False, None]) @pytest.mark.parametrize("weight_symmetry", [True, False]) def test_bit_depths( mock_per_tensor_calibration, bit_depth, quant_type, input_symmetry, weight_symmetry ): model = Linear(64, 64) quant_config = create_config(bit_depth, quant_type, input_symmetry, weight_symmetry) apply_quantization_config(model, quant_config) min = -1 * int(2**bit_depth / 2) max = int(2**bit_depth / 2) - 1 inputs = torch.randn(32, 64) model.apply( lambda module: mock_per_tensor_calibration( module, base_name="weight", value=model.weight ) ) if input_symmetry is not None: model.apply( lambda module: mock_per_tensor_calibration( module, base_name="input", value=inputs ) ) assert model.input_zero_point >= min assert model.input_zero_point <= max input_max = torch.max(inputs) input_min = torch.min(inputs) diff_from_max = abs( abs(model.input_scale * (max - model.input_zero_point)) - abs(input_max) ) diff_from_min = abs( abs(model.input_scale * abs(min - model.input_zero_point)) - abs(input_min) ) assert diff_from_max < model.input_scale or diff_from_min < model.input_scale assert model.weight_zero_point >= min assert model.weight_zero_point <= max weight_max = torch.max(model.weight) weight_min = torch.min(model.weight) diff_from_max = abs( abs(model.weight_scale * (max - model.weight_zero_point)) - abs(weight_max) ) diff_from_min = abs( abs(model.weight_scale * abs(min - model.weight_zero_point)) - abs(weight_min) ) assert diff_from_max < model.weight_scale or diff_from_min < model.weight_scale quantized_weight = fake_quantize( model.weight, model.weight_scale, model.weight_zero_point, model.quantization_scheme.weights, ) assert not torch.any(quantized_weight < min).item() assert not torch.any(quantized_weight > max).item() @torch.no_grad @pytest.mark.parametrize("bit_depth", [8]) @pytest.mark.parametrize("quant_type", ["float"]) @pytest.mark.parametrize("input_symmetry", [True, False, None]) @pytest.mark.parametrize("weight_symmetry", [True, False]) def test_fp8( mock_per_tensor_calibration, bit_depth, quant_type, input_symmetry, weight_symmetry ): model = Linear(64, 64) quant_config = create_config(bit_depth, quant_type, input_symmetry, weight_symmetry) apply_quantization_config(model, quant_config) dtype_info = torch.finfo(torch.float8_e4m3fn) min = dtype_info.min max = dtype_info.max inputs = torch.randn(32, 64) model.apply( lambda module: mock_per_tensor_calibration( module, base_name="weight", value=model.weight ) ) assert model.weight_zero_point.dtype == torch.float8_e4m3fn model.weight_zero_point.data = 
model.weight_zero_point.to(model.weight.dtype) if input_symmetry is not None: model.apply( lambda module: mock_per_tensor_calibration( module, base_name="input", value=inputs ) ) assert model.input_zero_point.dtype == torch.float8_e4m3fn model.input_zero_point.data = model.input_zero_point.to(model.weight.dtype) assert model.input_zero_point >= min assert model.input_zero_point <= max inputs_fake_quant = quantize( inputs, model.input_scale, model.input_zero_point, model.quantization_scheme.input_activations, ) input_max = torch.max(inputs_fake_quant) input_min = torch.min(inputs_fake_quant) diff_from_max = abs(input_max - max) diff_from_min = abs(input_min - min) assert diff_from_max.item() == 0.0 or diff_from_min.item() == 0.0 assert model.weight_zero_point >= min assert model.weight_zero_point <= max weight_fake_quant = quantize( model.weight, model.weight_scale, model.weight_zero_point, model.quantization_scheme.weights, ) weight_max = torch.max(weight_fake_quant) weight_min = torch.min(weight_fake_quant) diff_from_max = abs(weight_max - max) diff_from_min = abs(weight_min - min) assert diff_from_max.item() == 0.0 or diff_from_min.item() == 0.0 compressed-tensors-0.9.4/tests/test_quantization/test_configs/test_strategies.py000066400000000000000000000111051500222531600305560ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
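# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# bit-depth tests above build a QuantizationConfig and push it onto a model
# with apply_quantization_config. A minimal version of that flow, assuming a
# toy two-layer model; the int8 weight-only settings are arbitrary examples.
# ---------------------------------------------------------------------------
def _example_apply_quantization_config():
    from torch.nn import Linear, Sequential

    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationConfig,
        QuantizationScheme,
        QuantizationStatus,
        apply_quantization_config,
    )

    model = Sequential(Linear(32, 32), Linear(32, 32))

    config = QuantizationConfig(
        config_groups={
            "group_1": QuantizationScheme(
                targets=["Linear"],
                weights=QuantizationArgs(num_bits=8, type="int", symmetric=True),
            )
        },
        quantization_status=QuantizationStatus.CALIBRATION,
    )
    apply_quantization_config(model, config)

    # every submodule matched by the "Linear" target now carries a scheme
    # and freshly initialized weight qparams
    for submodule in model:
        assert hasattr(submodule, "quantization_scheme")
        assert hasattr(submodule, "weight_scale")
    return model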
import pytest import torch from compressed_tensors.quantization import ( QuantizationArgs, QuantizationConfig, QuantizationScheme, QuantizationStatus, QuantizationStrategy, apply_quantization_config, ) from torch.nn import Linear def create_config( input_symmetry, weight_symmetry, w_strategy, i_strategy=None, group_size=None ): weights = QuantizationArgs( symmetric=weight_symmetry, strategy=w_strategy, group_size=group_size ) if input_symmetry is not None: inputs = QuantizationArgs( symmetric=input_symmetry, strategy=i_strategy, group_size=group_size ) else: inputs = None config_groups = { "group_1": QuantizationScheme( targets=["Linear"], weights=weights, input_activations=inputs ) } config = QuantizationConfig( config_groups=config_groups, quantization_status=QuantizationStatus.CALIBRATION ) return config @torch.no_grad @pytest.mark.parametrize("input_symmetry", [None]) @pytest.mark.parametrize("weight_symmetry", [True, False]) @pytest.mark.parametrize("model_shape", [(64, 128), (300, 200), (400, 400)]) def test_channelwise( mock_per_channel_calibration, input_symmetry, weight_symmetry, model_shape ): model = Linear(model_shape[0], model_shape[1]) quant_config = create_config( input_symmetry, weight_symmetry, w_strategy=QuantizationStrategy.CHANNEL ) apply_quantization_config(model, quant_config) inputs = torch.randn(32, model_shape[0]) mock_per_channel_calibration(model, base_name="weight", value=model.weight) if input_symmetry is not None: mock_per_channel_calibration(model, base_name="input", value=inputs) assert model.weight_scale.shape == (model_shape[1], 1) assert model.weight_zero_point.shape == (model_shape[1], 1) @torch.no_grad @pytest.mark.parametrize("input_symmetry", [None]) @pytest.mark.parametrize("weight_symmetry", [True, False]) @pytest.mark.parametrize("model_shape", [(128, 256), (256, 512), (512, 1024)]) @pytest.mark.parametrize("group_size", [32, 128]) def test_group( mock_per_group_calibration, input_symmetry, weight_symmetry, model_shape, group_size ): model = Linear(model_shape[0], model_shape[1]) quant_config = create_config( input_symmetry, weight_symmetry, w_strategy=QuantizationStrategy.GROUP, group_size=group_size, ) apply_quantization_config(model, quant_config) inputs = torch.randn(128, model_shape[0]) mock_per_group_calibration( model, base_name="weight", value=model.weight, group_size=group_size ) if input_symmetry is not None: mock_per_group_calibration( model, base_name="input", value=inputs, group_size=group_size ) assert model.weight_scale.shape == ( model_shape[1], int(model_shape[0] / group_size), ) assert model.weight_zero_point.shape == ( model_shape[1], int(model_shape[0] / group_size), ) @torch.no_grad @pytest.mark.parametrize("input_symmetry", [True, False]) @pytest.mark.parametrize("weight_symmetry", [True, False]) @pytest.mark.parametrize("input_shape", [(32, 256), (300, 200), (400, 400)]) def test_token( mock_per_channel_calibration, mock_per_token_calibration, input_symmetry, weight_symmetry, input_shape, ): model = Linear(input_shape[1], 256) quant_config = create_config( input_symmetry, weight_symmetry, w_strategy=QuantizationStrategy.CHANNEL, i_strategy=QuantizationStrategy.TOKEN, ) apply_quantization_config(model, quant_config) inputs = torch.randn(input_shape) mock_per_channel_calibration(model, base_name="weight", value=model.weight) mock_per_token_calibration(model, base_name="input", value=inputs) assert model.input_scale.shape == (1, 1) assert model.input_zero_point.shape == (1, 1) assert model.weight_scale.shape == (256, 1) assert 
model.weight_zero_point.shape == (256, 1) compressed-tensors-0.9.4/tests/test_quantization/test_quant_args.py000066400000000000000000000134171500222531600260710ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest from compressed_tensors.quantization import ( ActivationOrdering, QuantizationArgs, QuantizationStrategy, QuantizationType, ) from pydantic import ValidationError def test_defaults(): default = QuantizationArgs() assert default.num_bits == 8 assert default.type == QuantizationType.INT assert default.symmetric assert default.strategy == QuantizationStrategy.TENSOR assert default.group_size is None assert default.block_structure is None def test_group(): kwargs = {"strategy": "group", "group_size": 128} group = QuantizationArgs(**kwargs) assert group.strategy == QuantizationStrategy.GROUP assert group.group_size == kwargs["group_size"] with pytest.raises(ValueError): QuantizationArgs(strategy=QuantizationStrategy.GROUP, group_size=-1) args = QuantizationArgs(group_size=128, strategy="group") assert args.group_size == 128 assert args.strategy == "group" with pytest.raises(ValueError): QuantizationArgs(strategy=QuantizationStrategy.GROUP) with pytest.raises(ValueError): QuantizationArgs(strategy="tensor", group_size=128) def test_block(): kwargs = {"strategy": "block", "block_structure": "2x4"} block = QuantizationArgs(**kwargs) assert block.strategy == QuantizationStrategy.BLOCK assert block.block_structure == kwargs["block_structure"] def test_infer_strategy(): args = QuantizationArgs(group_size=128) assert args.strategy == QuantizationStrategy.GROUP args = QuantizationArgs(group_size=-1) assert args.strategy == QuantizationStrategy.CHANNEL def test_enums(): assert QuantizationArgs( type=QuantizationType.INT, strategy=QuantizationStrategy.GROUP, actorder=ActivationOrdering.WEIGHT, group_size=1, ) == QuantizationArgs(type="InT", strategy="GROUP", actorder="weight", group_size=1) def test_actorder(): # test group inference with actorder args = QuantizationArgs(group_size=128, actorder=ActivationOrdering.GROUP) assert args.strategy == QuantizationStrategy.GROUP args = QuantizationArgs(group_size=128, actorder=ActivationOrdering.DYNAMIC) assert args.strategy == QuantizationStrategy.GROUP # test invalid pairings with pytest.raises(ValueError): QuantizationArgs(group_size=None, actorder="group") with pytest.raises(ValueError): QuantizationArgs(group_size=None, actorder="weight") with pytest.raises(ValueError): QuantizationArgs(group_size=None, actorder="static") with pytest.raises(ValueError): QuantizationArgs(group_size=-1, actorder="group") with pytest.raises(ValueError): QuantizationArgs(group_size=-1, actorder="weight") with pytest.raises(ValueError): QuantizationArgs(group_size=-1, actorder="static") with pytest.raises(ValueError): QuantizationArgs(strategy="tensor", actorder="group") with pytest.raises(ValueError): QuantizationArgs(strategy="tensor", actorder="weight") with 
pytest.raises(ValueError): QuantizationArgs(strategy="tensor", actorder="static") # test boolean and none defaulting assert ( QuantizationArgs(group_size=1, actorder=True).actorder == ActivationOrdering.GROUP ) assert QuantizationArgs(group_size=1, actorder=False).actorder is None assert QuantizationArgs(group_size=1, actorder=None).actorder is None def test_actorder_aliases(): assert ( ActivationOrdering.GROUP == ActivationOrdering.DYNAMIC == ActivationOrdering.GROUP ) assert ( ActivationOrdering.WEIGHT == ActivationOrdering.STATIC == ActivationOrdering.WEIGHT ) assert ActivationOrdering.GROUP == "dynamic" == ActivationOrdering.GROUP assert ActivationOrdering.DYNAMIC == "dynamic" == ActivationOrdering.DYNAMIC assert ActivationOrdering.GROUP == "group" == ActivationOrdering.GROUP assert ActivationOrdering.DYNAMIC == "group" == ActivationOrdering.DYNAMIC assert ActivationOrdering.WEIGHT == "static" == ActivationOrdering.WEIGHT assert ActivationOrdering.STATIC == "static" == ActivationOrdering.STATIC assert ActivationOrdering.WEIGHT == "weight" == ActivationOrdering.WEIGHT assert ActivationOrdering.STATIC == "weight" == ActivationOrdering.STATIC assert ActivationOrdering.WEIGHT != "dynamic" != ActivationOrdering.WEIGHT assert ActivationOrdering.STATIC != "dynamic" != ActivationOrdering.STATIC assert ActivationOrdering.WEIGHT != "group" != ActivationOrdering.WEIGHT assert ActivationOrdering.STATIC != "group" != ActivationOrdering.STATIC assert ActivationOrdering.GROUP != "static" != ActivationOrdering.GROUP assert ActivationOrdering.DYNAMIC != "static" != ActivationOrdering.DYNAMIC assert ActivationOrdering.GROUP != "weight" != ActivationOrdering.GROUP assert ActivationOrdering.DYNAMIC != "weight" != ActivationOrdering.DYNAMIC def test_invalid(): with pytest.raises(ValidationError): QuantizationArgs(type="invalid") with pytest.raises(ValidationError): QuantizationArgs(strategy="invalid") with pytest.raises(ValidationError): QuantizationArgs(strategy=QuantizationStrategy.GROUP) compressed-tensors-0.9.4/tests/test_quantization/test_quant_config.py000066400000000000000000000054301500222531600263760ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
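# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): how
# the strategy inference tested above translates into scale / zero-point
# shapes, mirroring the expectations in test_initialize_quantization_parameters.
# The weight shape and group size below are arbitrary examples.
# ---------------------------------------------------------------------------
def _example_qparam_shapes_by_strategy():
    from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy

    out_features, in_features = 8, 64  # hypothetical Linear weight shape

    grouped = QuantizationArgs(group_size=16)  # strategy inferred as GROUP
    assert grouped.strategy == QuantizationStrategy.GROUP
    # one (scale, zero_point) pair per group of input columns, per output row
    group_shape = (out_features, in_features // grouped.group_size)  # (8, 4)

    channel = QuantizationArgs(group_size=-1)  # strategy inferred as CHANNEL
    assert channel.strategy == QuantizationStrategy.CHANNEL
    channel_shape = (out_features, 1)  # one pair per output row

    per_tensor = QuantizationArgs()  # defaults to a single per-tensor pair
    assert per_tensor.strategy == QuantizationStrategy.TENSOR
    tensor_shape = (1,)

    return group_shape, channel_shape, tensor_shape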
import pytest from compressed_tensors.quantization import ( DEFAULT_QUANTIZATION_FORMAT, DEFAULT_QUANTIZATION_METHOD, QuantizationConfig, QuantizationScheme, QuantizationStatus, ) from pydantic import ValidationError def test_basic_config(): config_groups = {"group_1": QuantizationScheme(targets=[])} config = QuantizationConfig(config_groups=config_groups) assert config.config_groups == config_groups assert config.quant_method == DEFAULT_QUANTIZATION_METHOD assert config.format == DEFAULT_QUANTIZATION_FORMAT assert config.quantization_status == QuantizationStatus.INITIALIZED assert config.global_compression_ratio is None assert isinstance(config.ignore, list) and len(config.ignore) == 0 def test_full_config(): config_groups = { "group_1": QuantizationScheme(targets=[]), "group_2": QuantizationScheme(targets=[]), } global_compression_ratio = 3.5 ignore = ["model.layers.0"] quantization_status = "compressed" config = QuantizationConfig( config_groups=config_groups, global_compression_ratio=global_compression_ratio, ignore=ignore, quantization_status=quantization_status, ) assert config.config_groups == config_groups assert config.global_compression_ratio == global_compression_ratio assert config.ignore == ignore assert config.quantization_status == QuantizationStatus.COMPRESSED def test_need_config_groups(): with pytest.raises(ValidationError): _ = QuantizationScheme() @pytest.mark.parametrize( "scheme_name", ["W8A8", "W8A16", "W4A16", "FP8"], ) def test_load_scheme_from_preset(scheme_name: str): targets = ["Linear"] config = QuantizationConfig(config_groups={scheme_name: targets}) assert scheme_name in config.config_groups assert isinstance(config.config_groups[scheme_name], QuantizationScheme) assert config.config_groups[scheme_name].targets == targets def test_to_dict(): config_groups = {"group_1": QuantizationScheme(targets=[])} config = QuantizationConfig(config_groups=config_groups) reloaded = QuantizationConfig.model_validate(config.to_dict()) assert config == reloaded compressed-tensors-0.9.4/tests/test_quantization/test_quant_scheme.py000066400000000000000000000037251500222531600264020ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
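# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# config behaviours exercised above, namely loading a preset scheme by name
# and round-tripping a config through its dict form. The "W4A16" preset,
# "lm_head" ignore entry, and 4-bit args are arbitrary example choices.
# ---------------------------------------------------------------------------
def _example_quantization_config_usage():
    from compressed_tensors.quantization import (
        QuantizationArgs,
        QuantizationConfig,
        QuantizationScheme,
    )

    # preset scheme names ("W4A16", "W8A8", "FP8", ...) may be used as
    # config_group keys, mapping directly to a list of targets
    preset = QuantizationConfig(config_groups={"W4A16": ["Linear"]})
    assert isinstance(preset.config_groups["W4A16"], QuantizationScheme)

    # explicitly constructed configs serialize to plain dicts (for example,
    # into a model's config.json) and validate back into an equal object
    config = QuantizationConfig(
        config_groups={
            "group_1": QuantizationScheme(
                targets=["Linear"], weights=QuantizationArgs(num_bits=4)
            )
        },
        ignore=["lm_head"],
    )
    reloaded = QuantizationConfig.model_validate(config.to_dict())
    assert reloaded == config
    return reloaded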
import pytest from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme from pydantic import ValidationError def test_basic_scheme(): targets = ["model.layer.0", "model.layer.3"] weights = QuantizationArgs() scheme = QuantizationScheme(targets=targets, weights=weights) assert scheme.targets == targets assert scheme.weights == weights assert scheme.input_activations is None assert scheme.output_activations is None def test_full_scheme(): targets = ["Linear"] weights = QuantizationArgs() input_activations = QuantizationArgs(num_bits=4) output_activations = QuantizationArgs(num_bits=8, type="float", symmetric=False) scheme = QuantizationScheme( targets=targets, weights=weights, input_activations=input_activations, output_activations=output_activations, ) assert scheme.targets == targets assert scheme.weights == weights assert scheme.input_activations == input_activations assert scheme.output_activations == output_activations def test_needs_targets(): with pytest.raises(ValidationError): _ = QuantizationScheme() def test_defaults(): targets = ["Linear"] output = QuantizationScheme(targets=targets) assert output.weights is None assert output.input_activations is None assert output.output_activations is None compressed-tensors-0.9.4/tests/test_quantization/test_utils/000077500000000000000000000000001500222531600245055ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_quantization/test_utils/test_helpers.py000066400000000000000000000036441500222531600275670ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest import torch from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy from compressed_tensors.quantization.utils import calculate_qparams @pytest.mark.parametrize( "keepdims,strategy,exp_shape", [ ( False, QuantizationStrategy.TENSOR, torch.Size( [ 1, ] ), ), (True, QuantizationStrategy.CHANNEL, torch.Size([1, 1])), (True, QuantizationStrategy.GROUP, torch.Size([1, 1])), ( False, QuantizationStrategy.BLOCK, torch.Size( [ 1, ] ), ), (True, QuantizationStrategy.TOKEN, torch.Size([1, 1])), ], ) def test_calculate_qparams(keepdims, strategy, exp_shape): value = torch.randn(14, 5) min_val = torch.amin(value, dim=tuple(), keepdims=keepdims) max_val = torch.amax(value, dim=tuple(), keepdims=keepdims) if strategy == QuantizationStrategy.GROUP: args = QuantizationArgs(strategy=strategy, group_size=2) else: args = QuantizationArgs(strategy=strategy) scale, zp = calculate_qparams(min_val, max_val, args) assert scale.shape == exp_shape assert zp.shape == exp_shape compressed-tensors-0.9.4/tests/test_registry.py000066400000000000000000000032121500222531600220000ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import pytest from compressed_tensors import ( BaseCompressor, BitmaskCompressor, BitmaskConfig, CompressionFormat, DenseCompressor, DenseSparsityConfig, SparsityCompressionConfig, ) @pytest.mark.parametrize( "name,type", [ [CompressionFormat.sparse_bitmask.value, BitmaskConfig], [CompressionFormat.dense.value, DenseSparsityConfig], ], ) def test_configs(name, type): config = SparsityCompressionConfig.load_from_registry(name) assert isinstance(config, type) assert config.format == name @pytest.mark.parametrize( "name,type", [ [CompressionFormat.sparse_bitmask.value, BitmaskCompressor], [CompressionFormat.dense.value, DenseCompressor], ], ) def test_compressors(name, type): compressor = BaseCompressor.load_from_registry( name, config=SparsityCompressionConfig(format="none") ) assert isinstance(compressor, type) assert isinstance(compressor.config, SparsityCompressionConfig) assert compressor.config.format == "none" compressed-tensors-0.9.4/tests/test_utils/000077500000000000000000000000001500222531600207205ustar00rootroot00000000000000compressed-tensors-0.9.4/tests/test_utils/__init__.py000066400000000000000000000011511500222531600230270ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. compressed-tensors-0.9.4/tests/test_utils/test_helpers.py000066400000000000000000000120161500222531600237730ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
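# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# registry lookups exercised by test_registry.py above resolve both sparsity
# configs and compressors from a format name string.
# ---------------------------------------------------------------------------
def _example_registry_lookup():
    from compressed_tensors import (
        BaseCompressor,
        CompressionFormat,
        SparsityCompressionConfig,
    )

    fmt = CompressionFormat.sparse_bitmask.value  # "sparse-bitmask"

    # config classes and compressor classes are registered under that name
    config = SparsityCompressionConfig.load_from_registry(fmt)
    compressor = BaseCompressor.load_from_registry(fmt, config=config)

    assert config.format == fmt
    assert compressor.config.format == fmt
    return compressor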
import os import pytest import torch from compressed_tensors import load_compressed, save_compressed, save_compressed_model from compressed_tensors.config import BitmaskConfig from safetensors.torch import save_model from transformers import AutoModelForCausalLM @pytest.fixture def tensors(): tensors = {"tensor_1": torch.Tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])} return tensors @pytest.fixture def llama_model(tmp_path): model_name = "neuralmagic/llama2.c-stories110M-pruned50" model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", cache_dir=tmp_path ) yield model def test_save_compressed_sparse_bitmask(tmp_path, tensors): save_compressed( tensors, compression_format="sparse-bitmask", save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() def test_save_compressed_dense(tmp_path, tensors): save_compressed( tensors, compression_format="dense", save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() def test_save_compressed_no_compression(tmp_path, tensors): save_compressed( tensors, save_path=tmp_path / "model.safetensors", ) assert (tmp_path / "model.safetensors").exists() def test_save_compressed_error(tmp_path): with pytest.raises(Exception): save_compressed({}, "") with pytest.raises(Exception): save_compressed(None, "") with pytest.raises(Exception): save_compressed( tensors, compression_format="this_is_not_a_valid_format", save_path=tmp_path / "model.safetensors", ) def test_load_compressed_sparse_bitmask(tmp_path, tensors): save_compressed( tensors, compression_format="sparse-bitmask", save_path=tmp_path / "model.safetensors", ) compression_config = BitmaskConfig( format="sparse-bitmask", ) loaded_tensors = dict( load_compressed(tmp_path / "model.safetensors", compression_config) ) for key in tensors: assert torch.allclose(tensors[key], loaded_tensors[key]) def test_load_compressed_dense(tmp_path, tensors): save_compressed( tensors, compression_format="dense", save_path=tmp_path / "model.safetensors", ) save_compressed( tensors, save_path=tmp_path / "model_.safetensors", ) loaded_tensors = dict(load_compressed(tmp_path / "model.safetensors")) loaded_tensors_ = dict(load_compressed(tmp_path / "model_.safetensors")) # loaded_tensors should be equal to loaded_tensors_ for key in tensors: assert torch.allclose(loaded_tensors[key], loaded_tensors_[key]) def test_load_compressed_sharded(tmp_path, llama_model): sharded_model_path = tmp_path / "shared_model" llama_model.save_pretrained(sharded_model_path, max_shard_size="2MB") # make sure that model is shared on disk assert len(os.listdir(sharded_model_path)) > 1 loaded_state_dict = dict(load_compressed(sharded_model_path)) for key, value in llama_model.state_dict().items(): if key == "lm_head.weight": # lm_head doesn't have separate weights. # It shares its weight tensor with the token embedding layer. 
continue assert torch.allclose(value, loaded_state_dict[key]) def test_save_compressed_model(tmp_path, llama_model): path_to_uncompressed = tmp_path / "model_uncompressed.safetensors" path_to_compressed = tmp_path / "model_compressed.safetensors" # save uncompressed model save_model(llama_model, path_to_uncompressed) size_uncompressed_kb = path_to_uncompressed.stat().st_size / 1024 # save compressed model save_compressed_model( llama_model, path_to_compressed, compression_format="sparse-bitmask" ) size_compressed_kb = path_to_compressed.stat().st_size / 1024 # compare that the are the same after loading state_dict_1 = dict(load_compressed(path_to_uncompressed)) state_dict_2 = dict( load_compressed(path_to_compressed, BitmaskConfig(format="sparse-bitmask")) ) assert all( torch.allclose(state_dict_1[key], state_dict_2[key]) for key in state_dict_1 ) # make sure that compressed model is smaller # than uncompressed by roughly 1.14 (value established empirically) assert pytest.approx(size_uncompressed_kb / size_compressed_kb, 0.01) == 1.14 compressed-tensors-0.9.4/tests/test_utils/test_offload.py000066400000000000000000000260321500222531600237460ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
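# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# save/load round trip exercised above, written as a small helper. The
# output directory argument and tensor values are arbitrary examples.
# ---------------------------------------------------------------------------
def _example_save_and_load_compressed(out_dir):
    import torch

    from compressed_tensors import load_compressed, save_compressed
    from compressed_tensors.config import BitmaskConfig

    path = f"{out_dir}/example.safetensors"
    tensors = {"w": torch.tensor([[0.0, 0.0, 1.0], [2.0, 0.0, 0.0]])}

    # compress the (sparse) tensors with the bitmask format ...
    save_compressed(tensors, compression_format="sparse-bitmask", save_path=path)

    # ... then decompress with a matching config; load_compressed yields
    # (name, tensor) pairs, so dict() rebuilds the original state dict
    loaded = dict(load_compressed(path, BitmaskConfig(format="sparse-bitmask")))
    assert torch.allclose(loaded["w"], tensors["w"])
    return loaded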
import pytest import torch from compressed_tensors.utils import ( align_module_device, delete_offload_parameter, disable_hf_hook, get_execution_device, has_offloaded_params, register_offload_parameter, update_offload_parameter, ) from compressed_tensors.utils.offload import offload_to_weights_map from tests.testing_utils import requires_accelerate, requires_gpu class ExampleModule(torch.nn.Module): def __init__(self): super().__init__() self.a = torch.nn.Parameter(torch.tensor(0).float()) self.b = torch.nn.Parameter(torch.tensor(0).float()) def forward(self, x): return x * self.a + self.b @requires_accelerate() def test_has_offloaded_params(): from accelerate.big_modeling import cpu_offload_with_hook from accelerate.hooks import attach_align_device_hook, remove_hook_from_module module = ExampleModule() assert not has_offloaded_params(module) attach_align_device_hook(module, offload=False) assert not has_offloaded_params(module) remove_hook_from_module(module) module, _ = cpu_offload_with_hook(module) assert not has_offloaded_params(module) remove_hook_from_module(module) attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) assert has_offloaded_params(module) @requires_gpu @requires_accelerate() def test_get_execution_device(): from accelerate import init_empty_weights from accelerate.big_modeling import attach_align_device_hook # no offloading module = ExampleModule() assert get_execution_device(module) == torch.device("cpu") # with offloading attach_align_device_hook(module, torch.device("cuda:0")) assert get_execution_device(module) == torch.device("cuda:0") # in meta context with torch.device("meta"): module = ExampleModule() assert get_execution_device(module) == torch.device("meta") # offloaded in meta context module = ExampleModule() attach_align_device_hook(module, torch.device("cuda:0")) with torch.device("meta"): assert get_execution_device(module) == torch.device("cuda:0") # in empty weights context with init_empty_weights(): module = ExampleModule() assert get_execution_device(module) == torch.device("meta") # offloaded in empty weights context module = ExampleModule() attach_align_device_hook(module, torch.device("cuda:0")) with init_empty_weights(): assert get_execution_device(module) == torch.device("cuda:0") @requires_accelerate() def test_register_offload_parameter(): from accelerate import init_empty_weights from accelerate.hooks import attach_align_device_hook module = ExampleModule() parameter = torch.nn.Parameter(torch.tensor(1.0)) # register a param prior to offloading register_offload_parameter(module, "c", parameter) assert hasattr(module, "c") and module.c == parameter # offloading, check that added param was offloaded attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) assert "c" in module._hf_hook.weights_map # register a param after offloading, check that added param was offloaded register_offload_parameter(module, "d", parameter) assert hasattr(module, "d") and module.d.device == torch.device("meta") assert module._hf_hook.weights_map["d"].device == torch.device("cpu") # added parameters can be onloaded and offloaded with align_module_device(module, execution_device="cpu"): assert module.c.device == torch.device("cpu") assert module.d.device == torch.device("cpu") assert module.c.device == torch.device("meta") assert module.d.device == torch.device("meta") # parameters can be added during onload with align_module_device(module, execution_device="cpu"): register_offload_parameter(module, "e", parameter) 
assert module.e.device == torch.device("cpu") # parameters can be added before onload and with explicit offload register_offload_parameter(module, "f", parameter, offload_device="cpu") assert module._hf_hook.weights_map["f"].device == torch.device("cpu") with align_module_device(module, execution_device="cpu"): assert module.f.device == torch.device("cpu") assert module._hf_hook.weights_map["f"].device == torch.device("cpu") # parameters registered in the empty init context are still empty with init_empty_weights(): module = ExampleModule() register_offload_parameter(module, "c", parameter) assert module.a.device == module.b.device == module.c.device == torch.device("meta") @requires_accelerate() def test_update_offload_parameter(): from accelerate.hooks import attach_align_device_hook module = ExampleModule() tensor_a = torch.tensor(1.0) tensor_b = torch.tensor(2.0) # can update modules which are not offloaded update_offload_parameter(module, "a", tensor_a) assert module.a == tensor_a # can update modules which are offloaded attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) update_offload_parameter(module, "b", tensor_b) assert module.b.device == torch.device("meta") assert module._hf_hook.weights_map["b"] == tensor_b # data persists across onloading with align_module_device(module, execution_device="cpu"): assert module.a.data == tensor_a assert module.b.data == tensor_b assert module._hf_hook.weights_map["a"] == tensor_a assert module._hf_hook.weights_map["b"] == tensor_b # data persists across offloading assert module.a.device == torch.device("meta") assert module.b.device == torch.device("meta") assert module._hf_hook.weights_map["a"] == tensor_a assert module._hf_hook.weights_map["b"] == tensor_b # can update with differnt shape with warning with pytest.warns(): new_data = torch.tensor([3.0]) update_offload_parameter(module, "a", new_data) assert module._hf_hook.weights_map["a"] == new_data @requires_accelerate() def test_delete_offload_parameter(): from accelerate.hooks import attach_align_device_hook module = ExampleModule() param_c = torch.nn.Parameter(torch.tensor(1.0)) param_d = torch.nn.Parameter(torch.tensor(2.0)) register_offload_parameter(module, "c", param_c) register_offload_parameter(module, "d", param_d) # parameters are deleted delete_offload_parameter(module, "a") delete_offload_parameter(module, "c") assert not hasattr(module, "a") assert hasattr(module, "b") assert not hasattr(module, "c") assert hasattr(module, "d") # parameters and their offload are deleted attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) delete_offload_parameter(module, "b") delete_offload_parameter(module, "d") assert not hasattr(module, "a") assert not hasattr(module, "b") assert not hasattr(module, "c") assert not hasattr(module, "d") assert "a" not in module._hf_hook.weights_map assert "b" not in module._hf_hook.weights_map assert "c" not in module._hf_hook.weights_map assert "d" not in module._hf_hook.weights_map @requires_accelerate() def test_disable_hf_hook(): from accelerate.hooks import attach_align_device_hook module = ExampleModule() def custom_forward(): pass attach_align_device_hook(module, offload=True, weights_map=module.state_dict()) with disable_hf_hook(module): assert not hasattr(module, "_hf_hook") module.forward = custom_forward assert hasattr(module, "_hf_hook") assert module._old_forward == custom_forward @requires_accelerate() def test_disable_hf_hook_model_recurse(): from accelerate.hooks import 
attach_align_device_hook module0 = ExampleModule() module1 = ExampleModule() module2 = ExampleModule() model = torch.nn.Sequential(module0, torch.nn.Sequential(module1, module2)) attach_align_device_hook(model, offload=True, weights_map=model.state_dict()) with disable_hf_hook(model): assert not hasattr(module0, "_hf_hook") assert not hasattr(module1, "_hf_hook") assert not hasattr(module2, "_hf_hook") assert hasattr(module0, "_hf_hook") assert hasattr(module1, "_hf_hook") assert hasattr(module2, "_hf_hook") @requires_accelerate() def test_offload_to_weights_map(): from accelerate.utils import OffloadedWeightsLoader, PrefixedDataset name = "name" old_value = torch.tensor(0.0) new_value = torch.tensor(1.0) prefix = "prefix" # Dict empty weights_map = {} with pytest.raises(ValueError): offload_to_weights_map(weights_map, name, new_value) offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") assert weights_map[name] == new_value # Dict populated weights_map = {name: old_value} offload_to_weights_map(weights_map, name, new_value) assert weights_map[name] == new_value # OffloadedWeightsLoader[Dict] empty weights_map = OffloadedWeightsLoader({}) with pytest.raises(ValueError): offload_to_weights_map(weights_map, name, new_value) offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") assert weights_map[name] == new_value # OffloadedWeightsLoader[Dict] populated weights_map = OffloadedWeightsLoader({name: old_value}) offload_to_weights_map(weights_map, name, new_value) assert weights_map[name] == new_value # PrefixedDataset[Dict] empty weights_map = PrefixedDataset({}, prefix) with pytest.raises(ValueError): offload_to_weights_map(weights_map, name, new_value) offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") assert weights_map[name] == new_value # PrefixedDataset[Dict] populated weights_map = PrefixedDataset({name: old_value}, prefix) offload_to_weights_map(weights_map, name, new_value) assert weights_map[name] == new_value # PrefixedDataset[OffloadedWeightsLoader[Dict]] empty weights_map = PrefixedDataset(OffloadedWeightsLoader({}), prefix) with pytest.raises(ValueError): offload_to_weights_map(weights_map, name, new_value) offload_to_weights_map(weights_map, name, new_value, offload_device="cpu") assert weights_map[name] == new_value # PrefixedDataset[OffloadedWeightsLoader[Dict]] populated weights_map = PrefixedDataset(OffloadedWeightsLoader({name: old_value}), prefix) offload_to_weights_map(weights_map, name, new_value) assert weights_map[name] == new_value compressed-tensors-0.9.4/tests/test_utils/test_safetensors_load.py000066400000000000000000000052101500222531600256620ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
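# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# offload helpers exercised above, combined into one flow. Requires
# `accelerate`; the "weight_scale" parameter name is an arbitrary example.
# ---------------------------------------------------------------------------
def _example_offloaded_parameter_roundtrip():
    import torch
    from accelerate.hooks import attach_align_device_hook

    from compressed_tensors.utils import (
        align_module_device,
        register_offload_parameter,
        update_offload_parameter,
    )

    module = torch.nn.Linear(4, 4)
    attach_align_device_hook(module, offload=True, weights_map=module.state_dict())

    # parameters registered after offloading live in the weights map, while
    # the module itself only holds a meta tensor
    scale = torch.nn.Parameter(torch.tensor(1.0))
    register_offload_parameter(module, "weight_scale", scale)
    assert module.weight_scale.device == torch.device("meta")

    # updates write through to the offloaded copy ...
    update_offload_parameter(module, "weight_scale", torch.tensor(2.0))

    # ... and the new value is visible while the module is temporarily onloaded
    with align_module_device(module, execution_device="cpu"):
        assert module.weight_scale.data == torch.tensor(2.0)
    return module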
from unittest.mock import patch import pytest from compressed_tensors.utils.safetensors_load import get_nested_weight_mappings mock_weight_mappings = { "layer1.weight": "file1", "layer1.bias": "file2", "layer2.weight": "file3", "layer2.bias": "file4", "layer3.weight": "file5", } @pytest.fixture def mock_get_weight_mappings(): with patch( "compressed_tensors.utils.safetensors_load.get_weight_mappings", return_value=mock_weight_mappings, ): yield @pytest.mark.usefixtures("mock_get_weight_mappings") class TestGetNestedWeightMappings: """ Tests for the get_nested_weight_mappings function in different scenarios, such as single and multiple parameters to nest, and returning other parameters """ def test_single_param(self): params_to_nest = ["weight"] result = get_nested_weight_mappings("dummy_path", params_to_nest) expected = { "layer1": {"weight": "file1"}, "layer2": {"weight": "file3"}, "layer3": {"weight": "file5"}, } assert result == expected def test_multiple_params(self): params_to_nest = ["weight", "bias"] result = get_nested_weight_mappings("dummy_path", params_to_nest) expected = { "layer1": {"weight": "file1", "bias": "file2"}, "layer2": {"weight": "file3", "bias": "file4"}, "layer3": {"weight": "file5"}, } assert result == expected def test_return_other_params(self): params_to_nest = ["weight"] result, other_params = get_nested_weight_mappings( "dummy_path", params_to_nest, return_unmatched_params=True ) expected_nested = { "layer1": {"weight": "file1"}, "layer2": {"weight": "file3"}, "layer3": {"weight": "file5"}, } expected_other = { "layer1.bias": "file2", "layer2.bias": "file4", } assert result == expected_nested assert other_params == expected_other compressed-tensors-0.9.4/tests/testing_utils.py000066400000000000000000000101311500222531600217640ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
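# ---------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not collected by pytest): the
# nesting behaviour exercised by test_safetensors_load.py above, using the
# same patching trick since no real checkpoint is on disk. The parameter and
# file names below are placeholders.
# ---------------------------------------------------------------------------
def _example_nested_weight_mappings():
    from unittest.mock import patch

    from compressed_tensors.utils.safetensors_load import get_nested_weight_mappings

    flat = {
        "decoder.weight": "model-00001.safetensors",
        "decoder.bias": "model-00002.safetensors",
    }

    with patch(
        "compressed_tensors.utils.safetensors_load.get_weight_mappings",
        return_value=flat,
    ):
        nested = get_nested_weight_mappings("unused_path", ["weight", "bias"])

    # flat "module.param" keys are regrouped per module prefix
    assert nested == {
        "decoder": {
            "weight": "model-00001.safetensors",
            "bias": "model-00002.safetensors",
        }
    }
    return nested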
# flake8: noqa import unittest import pytest def compressed_tensors_config_available(): try: from transformers.utils.quantization_config import ( # noqa: F401 CompressedTensorsConfig, ) return True except ImportError: return False def accelerate_availabe(): try: import accelerate # noqa: F401 return True except ImportError: return False _is_compressed_tensors_config_available = compressed_tensors_config_available() _is_accelerate_available = accelerate_availabe() def requires_hf_quantizer(): return pytest.mark.skipif( not _is_compressed_tensors_config_available, reason="requires transformers>=4.45 to support CompressedTensorsHfQuantizer", ) def requires_accelerate(): return pytest.mark.skipif( not _is_accelerate_available, reason="requires accelerate", ) def get_random_mat(M, K, dtype) -> "torch.Tensor": """ :param M: number of rows :param K: number of columns :param dtype: data type of the matrix :return: random matrix of shape (M, K) with non-zero values """ import torch from compressed_tensors.quantization import FP8_DTYPE rand_tensor_dtype = dtype if dtype in [torch.int8, FP8_DTYPE]: rand_tensor_dtype = torch.float16 mat = torch.rand(M, K, dtype=rand_tensor_dtype).cuda() mat = mat.masked_fill_(mat == 0, 1) return mat.to(dtype) def generate_pruned_semi_structured_mat(M, K, dtype) -> "torch.Tensor": """ :param M: number of rows :param K: number of columns :param dtype: data type of the matrix :return: random matrix of shape (M, K) with 2:4 sparsity pattern """ import torch from compressed_tensors.quantization import FP8_DTYPE mask = torch.Tensor([0, 0, 1, 1]).tile((M, K // 4)).bool() rand_tensor_dtype = dtype if dtype in [torch.int8, FP8_DTYPE]: rand_tensor_dtype = torch.float16 mat = torch.rand(M, K, dtype=rand_tensor_dtype) mat = mat.masked_fill_(mat == 0, 1) if dtype == FP8_DTYPE: # some float8_e4m3fn operations are not supported on CPU mat = mat.cuda() mask = mask.cuda() mat = mat * mask return mat.to(dtype) def induce_sparsity(tensor, sparsity_ratio) -> "torch.Tensor": """ Makes a tensor sparse by zeroing out a given fraction of its smallest absolute values. :param: weight_tensor (torch.Tensor): The input weight tensor. :param: sparsity_ratio (float): Fraction of weights to be zeroed (0 <= sparsity_ratio <= 1). :returns: torch.Tensor: Sparse version of the input tensor. """ import torch if not (0 <= sparsity_ratio <= 1): raise ValueError("Sparsity ratio must be between 0 and 1.") # Flatten the tensor and compute the threshold for sparsity flattened = tensor.view(-1) k = int(sparsity_ratio * flattened.numel()) if k > 0: threshold = torch.topk(flattened.abs(), k, largest=False).values.max() sparse_tensor = torch.where( tensor.abs() > threshold, tensor, torch.zeros_like(tensor) ) else: sparse_tensor = tensor return sparse_tensor def is_gpu_available(): """ :return: True if a GPU is available, False otherwise """ try: import torch # noqa: F401 return torch.cuda.device_count() > 0 except ImportError: return False def requires_gpu(test_case): return unittest.skipUnless(is_gpu_available(), "test requires GPU")(test_case) compressed-tensors-0.9.4/utils/000077500000000000000000000000001500222531600165175ustar00rootroot00000000000000compressed-tensors-0.9.4/utils/copyright.py000066400000000000000000000245371500222531600211140ustar00rootroot00000000000000# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import argparse import glob import os import sys from typing import List, NamedTuple COPYRIGHT_LINES = [ "Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.", "", 'Licensed under the Apache License, Version 2.0 (the "License");', "you may not use this file except in compliance with the License.", "You may obtain a copy of the License at", "", " http://www.apache.org/licenses/LICENSE-2.0", "", "Unless required by applicable law or agreed to in writing,", 'software distributed under the License is distributed on an "AS IS" BASIS,', "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.", "See the License for the specific language governing permissions and", "limitations under the License.", ] NO_COPYRIGHT_LINE = "neuralmagic: no copyright" QUALITY_COMMAND = "quality" STYLE_COMMAND = "style" def parse_args(): """ Setup and parse command line arguments for using the script """ parser = argparse.ArgumentParser( description=( "Add Neuralmagic copyright to the beginning of all " "files under the given glob patterns. " "Currently assumes Python files using '#' as the commenting prefix." ) ) subparsers = parser.add_subparsers(dest="command") quality_parser = subparsers.add_parser( QUALITY_COMMAND, description=( "Run check across the files in the given patterns and " "fail if any do not have a copyright in them" ), ) style_parser = subparsers.add_parser( STYLE_COMMAND, description=( "Add the copyright to any files in the given patterns if it is not present" ), ) for sub in [quality_parser, style_parser]: sub.add_argument( "patterns", type=str, default=[], nargs="+", help="the patterns to search through", ) return parser.parse_args() def quality(patterns: List[str]): """ Run a quality check across all files in the given glob patterns. This checks to make sure all matching files have the NM copyright present. If any do not, it will list them out and exit with an error. :param patterns: The glob file patterns to run quality check on """ check_files = _get_files(patterns) error_files = [] for file in check_files: if not _dont_copyright(file) and not _contains_copyright(file): print(f"would add copyright to {file}") error_files.append(file) if error_files: sys.exit( f"{len(error_files)} would be copyrighted, " f"{len(check_files) - len(error_files)} would be left unchanged." ) else: print(f"{len(check_files)} files have copyrights") def style(patterns: List[str]): """ Run a style application across all files in the given glob patterns. This checks to make sure all matching files have the NM copyright present. If any do not, it will append the copyright to above the file after any already contained headers such as shebang lines. 
:param patterns: The glob file patterns to run quality check on """ check_files = _get_files(patterns) copyrighted_files = [] for file in check_files: if not _dont_copyright(file) and not _contains_copyright(file): _add_copyright(file) print(f"copyrighted {file}") copyrighted_files.append(file) if copyrighted_files: print( f"{len(copyrighted_files)} file(s) copyrighted, " f"{len(check_files) - len(copyrighted_files)} files unchanged" ) else: print(f"{len(check_files)} files unchanged") def _get_files(patterns: List[str]) -> List[str]: files = [] for pattern in patterns: for file in glob.glob(pattern, recursive=True): files.append(os.path.abspath(os.path.expanduser(file))) files.sort() return files def _dont_copyright(file_path: str) -> bool: with open(file_path, "r") as file: content = file.read() try: content.index(NO_COPYRIGHT_LINE) return True except ValueError: return False def _contains_copyright(file_path: str) -> bool: with open(file_path, "r") as file: content = file.read() try: for line in COPYRIGHT_LINES: content.index(line) return True except ValueError: return False def _add_copyright(file_path: str): file_type = _file_type(file_path) if file_type == "unknown": raise ValueError( f"unsupported file_type given to be copyrighted at {file_path}" ) with open(file_path, "r+") as file: lines = file.readlines() header_info = _file_header_info(lines, file_type) inject_index = 0 if header_info.end_index > -1: # if there is already a header, we want to inject the copyright after it # additionally we'll need a new line between the prev header and copyright inject_index = header_info.end_index + 1 lines.insert(inject_index, "\n") inject_index += 1 # add the copyright at the inject index file_copyright = _file_copyright(file_type) lines.insert(inject_index, file_copyright) if not header_info.new_line_after: # if there wasn't a new line after the header, # add in a new line after to create space between the code and copyright inject_index += 1 lines.insert(inject_index, "\n") file.seek(0) file.writelines(lines) file.truncate() def _file_copyright(file_type: str) -> str: comment_formatting = _code_comment_formatting(file_type) lines = [] if comment_formatting.block_prefix: lines.append(comment_formatting.block_prefix) for line in COPYRIGHT_LINES: lines.append( f"{comment_formatting.line_prefix} {line}" if comment_formatting.line_prefix else line ) if comment_formatting.block_suffix: lines.append(comment_formatting.block_suffix) # make sure there is a new line after last line of the copyright lines.append("") return "\n".join(lines) _HeaderInfo = NamedTuple( "HeaderInfo", [ ("start_index", int), ("end_index", int), ("new_line_before", bool), ("new_line_after", bool), ], ) def _file_header_info(lines: List[str], file_type: str) -> _HeaderInfo: start_index = -1 end_index = -1 new_line_before = False new_line_after = False comment_formatting = _code_comment_formatting(file_type) prefix_found = False suffix_found = False for index, line in enumerate(lines): line = line.strip() if not line: # empty line, record the state of new lines before and after header if not prefix_found: new_line_before = True elif prefix_found and (suffix_found or not comment_formatting.block_suffix): new_line_after = True elif ( comment_formatting.block_prefix and line.startswith(comment_formatting.block_prefix) ) or ( not comment_formatting.block_prefix and line.startswith(comment_formatting.line_prefix) ): # start of header prefix_found = True start_index = index end_index = index suffix_found = 
comment_formatting.block_suffix and line.endswith( comment_formatting.block_suffix ) elif comment_formatting.block_suffix and line.endswith( comment_formatting.block_suffix ): # end of header suffix_found = True end_index = index elif prefix_found and comment_formatting.block_suffix and not suffix_found: # in the middle of the header, searching for the end # reset new_line_after in case there was a break in the header new_line_after = True else: # first non header line, break out break return _HeaderInfo(start_index, end_index, new_line_before, new_line_after) _CommentFormatting = NamedTuple( "CommentFormatting", [ ("line_prefix", str), ("block_prefix", str), ("block_suffix", str), ], ) def _code_comment_formatting(file_type: str) -> _CommentFormatting: if file_type == "python": return _CommentFormatting("#", "", "") elif file_type == "html" or file_type == "markdown": return _CommentFormatting("", "") elif file_type == "css" or file_type == "javascript": return _CommentFormatting("", "/*", "*/") elif file_type == "restructuredtext": return _CommentFormatting(" ", "..", "") raise ValueError(f"unsupported file_type given for code prefix suffix: {file_type}") def _file_type(file_path: str) -> str: if file_path.endswith(".py"): return "python" elif ( file_path.endswith(".js") or file_path.endswith(".jsx") or file_path.endswith(".ts") or file_path.endswith(".tsx") or file_path.endswith(".jss") ): return "javascript" elif file_path.endswith(".html"): return "html" elif file_path.endswith(".css"): return "css" elif file_path.endswith(".md"): return "markdown" elif file_path.endswith(".rst"): return "restructuredtext" return "unknown" def main(): args = parse_args() if args.command == QUALITY_COMMAND: quality(args.patterns) elif args.command == STYLE_COMMAND: style(args.patterns) else: raise ValueError(f"unknown command given: {args.command}") if __name__ == "__main__": main()
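# ---------------------------------------------------------------------------
# Illustrative usage sketch for the helper above (added for exposition). The
# glob patterns are arbitrary examples. From the command line:
#
#   python utils/copyright.py quality "src/**/*.py" "tests/**/*.py"
#   python utils/copyright.py style "src/**/*.py" "tests/**/*.py"
#
# The same checks can also be driven in-process; `quality` reports failures
# via sys.exit, so catch SystemExit when calling it programmatically.
# ---------------------------------------------------------------------------
def _example_copyright_check(patterns=("src/**/*.py",)):
    try:
        quality(list(patterns))
    except SystemExit as err:
        print(f"copyright check failed: {err}")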