dulwich-1.0.0/.codespellrc:

[codespell]
skip = .git,.mypy_cache,build,testdata
ignore-words-list = fpr,claus,feld,nd,bu,ue,te,fo,afile,manuel,checkin,assertIn,rin,caf

dulwich-1.0.0/.coveragerc:

[run]
branch = True
source = dulwich

[report]
exclude_lines =
    raise NotImplementedError

dulwich-1.0.0/.deepsource.toml:

version = 1
test_patterns = ["dulwich/**test_*.py"]
exclude_patterns = ["examples/**"]

[[analyzers]]
name = "python"
enabled = true

[analyzers.meta]
runtime_version = "3.x.x"

dulwich-1.0.0/.github/CODEOWNERS:

* @jelmer

# Release robot
dulwich/contrib/release_robot.py @mikofski
dulwich/contrib/test_release_robot.py @mikofski

dulwich-1.0.0/.github/FUNDING.yml:

github: jelmer

dulwich-1.0.0/.github/dependabot.yml:

# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
---
version: 2
updates:
  - package-ecosystem: "cargo"
    directory: "/"
    schedule:
      interval: "monthly"
    rebase-strategy: "disabled"
    commit-message:
      prefix: "deps"
      include: "scope"
    groups:
      cargo:
        patterns:
          - "*"
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"
    commit-message:
      prefix: "ci"
      include: "scope"
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "monthly"
    commit-message:
      prefix: "deps"
      include: "scope"
    groups:
      pip:
        patterns:
          - "*"

dulwich-1.0.0/.github/gpg-error-config:

#!/bin/sh
# gpg-error-config: simple replacement gpg-error-config that is a shim
# for pkg-config.
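# Summary of the flag mapping implemented by the case statement below
# (illustration only, no additional behaviour; any other flag is rejected):
#   gpg-error-config --cflags   ->  pkg-config --cflags gpg-error
#   gpg-error-config --libs     ->  pkg-config --libs gpg-error
#   gpg-error-config --version  ->  pkg-config --modversion gpg-error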
# Parse flags for arg in "$@"; do case $arg in --cflags) pkg-config --cflags gpg-error ;; --libs) pkg-config --libs gpg-error ;; --version) pkg-config --modversion gpg-error ;; *) echo "Unknown option: $arg" >&2 exit 1 ;; esac done dulwich-1.0.0/.github/gpgme-config000077500000000000000000000007771513301442600170110ustar00rootroot00000000000000#!/bin/bash # Parse gpgme-config-like flags, then invoke `pkg-config gpgme`: # * Pass --cflags and --libs through # * Map --version to --modversion # * Ignore --thread=pthread # Parse flags for arg in "$@"; do case "$arg" in --cflags|--libs|--modversion) flags="$flags $arg" ;; --version) flags="$flags --modversion" ;; --thread=pthread) ;; --prefix) flags="$flags --variable=prefix" ;; *) echo "Unknown flag: $arg" >&2 exit 1 ;; esac done exec pkg-config gpgme $flags dulwich-1.0.0/.github/workflows/000077500000000000000000000000001513301442600165435ustar00rootroot00000000000000dulwich-1.0.0/.github/workflows/auto-merge.yml000066400000000000000000000012121513301442600213270ustar00rootroot00000000000000name: Dependabot auto-merge on: pull_request_target permissions: contents: read jobs: dependabot: runs-on: ubuntu-latest if: ${{ github.actor == 'dependabot[bot]' }} permissions: pull-requests: write contents: write steps: - name: Dependabot metadata id: metadata uses: dependabot/fetch-metadata@v2.4.0 with: github-token: "${{ secrets.GITHUB_TOKEN }}" - name: Enable auto-merge for Dependabot PRs run: gh pr merge --auto --squash "$PR_URL" env: PR_URL: ${{github.event.pull_request.html_url}} GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} dulwich-1.0.0/.github/workflows/disperse.yml000066400000000000000000000004011513301442600210770ustar00rootroot00000000000000--- name: Disperse configuration "on": push: branches: [ main, master ] permissions: contents: read jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6.0.1 - uses: jelmer/action-disperse-validate@v2.0.1 dulwich-1.0.0/.github/workflows/docs.yml000066400000000000000000000010401513301442600202110ustar00rootroot00000000000000--- name: API Docs "on": push: branches: [main, master] pull_request: schedule: - cron: "0 6 * * *" # Daily 6AM UTC build permissions: contents: read jobs: apidocs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6.0.1 - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.13" - name: Install pydoctor run: | pip3 install pydoctor - name: Generate docs run: make apidocs PYDOCTOR_ARGS=--warnings-as-errors dulwich-1.0.0/.github/workflows/python-distributions.yml000066400000000000000000000155251513301442600235170ustar00rootroot00000000000000--- name: Build Python distributions "on": push: branches: [main, master] tags: - 'dulwich-*' pull_request: schedule: - cron: "0 6 * * *" # Daily 6AM UTC build permissions: contents: read jobs: define-matrix: runs-on: ubuntu-latest outputs: matrix: ${{ steps.merged-identifiers.outputs.merged-identifiers }} steps: - uses: actions/checkout@v6.0.1 - uses: actions/setup-python@v6 with: python-version: 3.x cache: pip - name: Install jq run: sudo apt-get update && sudo apt-get install -y jq - name: Install cibuildwheel run: pip install cibuildwheel - name: Find build identifiers using cibuildwheel --print-build-identifiers id: all-build-identifiers run: | echo "linux=$(cibuildwheel --platform linux --print-build-identifiers | grep -v cp314 | tr '\n' ' ')" >> $GITHUB_OUTPUT echo "macos=$(cibuildwheel --platform macos --print-build-identifiers | grep -v cp314 | tr '\n' ' ')" >> $GITHUB_OUTPUT echo "windows=$(cibuildwheel 
--platform windows --print-build-identifiers | grep -v cp314 | tr '\n' ' ')" >> $GITHUB_OUTPUT - name: Select build identifiers id: select-build-identifiers run: | if [[ "$GITHUB_REF" = "refs/heads/main" ]] || [[ "$GITHUB_REF" = "refs/heads/master" ]] || [[ "$GITHUB_REF" = "refs/tags/"* ]]; then echo 'linux=${{ steps.all-build-identifiers.outputs.linux }}' >> $GITHUB_OUTPUT echo 'windows=${{ steps.all-build-identifiers.outputs.windows }}' >> $GITHUB_OUTPUT echo 'macos=${{ steps.all-build-identifiers.outputs.macos }}' >> $GITHUB_OUTPUT else echo "linux=$(echo -n '${{ steps.all-build-identifiers.outputs.linux }}' | awk '{print $NF}')" >> $GITHUB_OUTPUT echo "macos=$(echo -n '${{ steps.all-build-identifiers.outputs.macos }}' | awk '{print $NF}')" >> $GITHUB_OUTPUT echo "windows=$(echo -n '${{ steps.all-build-identifiers.outputs.windows }}' | awk '{print $NF}')" >> $GITHUB_OUTPUT fi - name: Output build identifiers id: json-identifiers run: | echo "linux=$(echo -n '${{ steps.select-build-identifiers.outputs.linux }}' | jq -R -s -c 'split(" ") | map(select(length > 0)) | [.[] | {os: "ubuntu-latest", "build-identifier": .}]')" >> $GITHUB_OUTPUT echo "macos=$(echo -n '${{ steps.select-build-identifiers.outputs.macos }}' | jq -R -s -c 'split(" ") | map(select(length > 0)) | [.[] | {os: "macos-latest", "build-identifier": .}]')" >> $GITHUB_OUTPUT echo "windows=$(echo -n '${{ steps.select-build-identifiers.outputs.windows }}' | jq -R -s -c 'split(" ") | map(select(length > 0)) | [.[] | {os: "windows-latest", "build-identifier": .}]')" >> $GITHUB_OUTPUT - name: Merge build identifiers id: merged-identifiers run: | echo merged-identifiers=$(echo -n '${{ steps.json-identifiers.outputs.linux }} ${{ steps.json-identifiers.outputs.macos }} ${{ steps.json-identifiers.outputs.windows }}' | jq -c -s 'add') >> $GITHUB_OUTPUT build-wheels: runs-on: ${{ matrix.os }} needs: define-matrix strategy: matrix: include: ${{ fromJSON(needs.define-matrix.outputs.matrix ) }} fail-fast: true steps: - uses: actions/checkout@v6.0.1 - uses: actions/setup-python@v6 with: cache: pip - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel cibuildwheel setuptools-rust - name: Set up QEMU uses: docker/setup-qemu-action@v3.7.0 if: "matrix.os == 'ubuntu-latest'" - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse env: CIBW_BUILD: "${{ matrix.build-identifier }}*" - name: Upload wheels uses: actions/upload-artifact@v6.0.0 with: name: artifact-${{ matrix.build-identifier }} path: ./wheelhouse/*.whl build-android-wheels: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6.0.1 - uses: actions/setup-python@v6 with: cache: pip - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel cibuildwheel setuptools-rust - name: Build Android wheels run: python -m cibuildwheel --output-dir wheelhouse env: CIBW_PLATFORM: android CIBW_ARCHS_ANDROID: arm64_v8a x86_64 - name: Upload Android wheels uses: actions/upload-artifact@v6.0.0 with: name: artifact-android path: ./wheelhouse/*.whl build-pure-wheels: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6.0.1 - uses: actions/setup-python@v6 with: cache: pip - run: pip install build - run: PURE=true python -m build --wheel - name: Upload pure wheels uses: actions/upload-artifact@v6.0.0 with: name: artifact-pure path: ./dist/*.whl build-sdist: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6.0.1 - uses: actions/setup-python@v6 with: cache: pip - name: Install dependencies 
run: | python -m pip install --upgrade pip pip install build - name: Build sdist run: python -m build --sdist - name: Upload sdist uses: actions/upload-artifact@v6.0.0 with: name: artifact-source path: ./dist/*.tar.gz test-sdist: needs: - build-sdist runs-on: ubuntu-latest steps: - uses: actions/setup-python@v6 with: cache: pip - name: Install dependencies run: | python -m pip install --upgrade pip # Upgrade packging to avoid a bug in twine. # See https://github.com/pypa/twine/issues/1216 pip install "twine>=6.1.0" "packaging>=24.2" - name: Download sdist uses: actions/download-artifact@v7.0.0 with: name: artifact-source path: dist - name: Test sdist run: twine check dist/* - name: Test installation from sdist run: pip install dist/*.tar.gz publish: runs-on: ubuntu-latest needs: - build-wheels - build-android-wheels - build-sdist - build-pure-wheels if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/dulwich-') permissions: id-token: write environment: name: pypi url: https://pypi.org/p/dulwich steps: - name: Download distributions uses: actions/download-artifact@v7.0.0 with: merge-multiple: true pattern: artifact-* path: dist - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@v1.13.0 dulwich-1.0.0/.github/workflows/pythontest.yml000066400000000000000000000050641513301442600215140ustar00rootroot00000000000000name: Python tests on: push: branches: [ main, master ] pull_request: schedule: - cron: "0 6 * * *" # Daily 6AM UTC build permissions: contents: read jobs: test: runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] fail-fast: false steps: - uses: actions/checkout@v6.0.1 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} allow-prereleases: true cache: pip - name: Install native dependencies (Ubuntu) run: sudo apt-get update && sudo apt-get install -y libgpgme-dev libgpg-error-dev if: "matrix.os == 'ubuntu-latest'" - name: Provide gpgme-config and gpg-error-config if: "matrix.os == 'ubuntu-latest'" run: | mkdir -p "$HOME/.local/bin" cp .github/gpgme-config "$HOME/.local/bin/gpgme-config" cp .github/gpg-error-config "$HOME/.local/bin/gpg-error-config" echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Install native dependencies (MacOS) run: brew install swig gpgme if: "matrix.os == 'macos-latest'" - name: Install dependencies run: | # Install build-time dependencies python -m pip install --upgrade "setuptools>=77" python -m pip install --upgrade pip pip install --upgrade ".[merge,fastimport,paramiko,https,patiencediff,colordiff]" setuptools-rust - name: Install gpg on supported platforms run: pip install --upgrade ".[pgp]" if: "matrix.os != 'windows-latest' && matrix.python-version != 'pypy3'" - name: Style checks run: | pip install ".[dev]" python -m ruff check . python -m ruff format --check . - name: Typing checks run: | pip install --upgrade types-paramiko types-requests python -m mypy dulwich if: "matrix.python-version != 'pypy3'" - name: Build run: | python setup.py build_ext -i env: RUSTFLAGS: "-D warnings" - name: Run Rust tests run: | cargo test env: RUSTFLAGS: "-D warnings" - name: codespell run: | codespell --config .codespellrc . 
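      # Note (illustrative comment, not an extra workflow step): the checks
      # above can be reproduced locally with the same commands, assuming the
      # "dev" extras and codespell are installed:
      #   python -m ruff check . && python -m ruff format --check .
      #   python -m mypy dulwich
      #   codespell --config .codespellrc .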
- name: Coverage test suite run run: | pip install --upgrade coverage python -m coverage run -p -m unittest tests.test_suite dulwich-1.0.0/.gitignore000066400000000000000000000005511513301442600151370ustar00rootroot00000000000000_trial_temp build build-pypy MANIFEST dist apidocs *,cover .testrepository *.pyc *.pyd *.pyo *.so *~ *.swp *.swh *.swn *.swo docs/tutorial/index.html dulwich.egg-info/ .tox/ .idea/ .coverage htmlcov/ docs/api/*.txt .mypy_cache/ .eggs dulwich.dist-info .stestr target/ # Files created by OSS-Fuzz when running locally fuzz_*.pkg.spec .claude/settings.local.json dulwich-1.0.0/.mailmap000066400000000000000000000024321513301442600145700ustar00rootroot00000000000000Jelmer Vernooij Jelmer Vernooij Jelmer Vernooij Jelmer Vernooij Jelmer Vernooij Jelmer Vernooij Martin Dave Borowitz Dave Borowitz John Carr Mark Mikofski Mark Mikofski David Carr Jon Bain James Westby David Keijser Benoît HERVIER Ryan Faulkner David Bennett Risto Kankkunen Augie Fackler Damien Tournoud Marcin Kuźmiński dulwich-1.0.0/.readthedocs.yaml000066400000000000000000000003011513301442600163670ustar00rootroot00000000000000version: 2 build: os: ubuntu-22.04 tools: python: "3.11" sphinx: configuration: docs/conf.py formats: - pdf - epub python: install: - requirements: docs/requirements.txt dulwich-1.0.0/.stestr.conf000066400000000000000000000000321513301442600154120ustar00rootroot00000000000000[DEFAULT] test_path=tests dulwich-1.0.0/.testr.conf000066400000000000000000000002321513301442600152310ustar00rootroot00000000000000[DEFAULT] test_command=PYTHONPATH=. python3 -m subunit.run $IDOPTION $LISTOPT tests.test_suite test_id_option=--load-list $IDFILE test_list_option=--list dulwich-1.0.0/CODE_OF_CONDUCT.md000066400000000000000000000064311513301442600157510ustar00rootroot00000000000000# Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socioeconomic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: - Using welcoming and inclusive language - Being respectful of differing viewpoints and experiences - Gracefully accepting constructive criticism - Focusing on what is best for the community - Showing empathy towards other community members Examples of unacceptable behavior by participants include: - The use of sexualized language or imagery and unwelcome sexual attention or advances - Trolling, insulting/derogatory comments, and personal or political attacks - Public or private harassment - Publishing others' private information, such as a physical or electronic address, without explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at team@dulwich.io. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see dulwich-1.0.0/CONTRIBUTING.rst000066400000000000000000000154551513301442600156210ustar00rootroot00000000000000All functionality should be available in pure Python. Optional Rust implementations may be written for performance reasons, but should never replace the Python implementation. Where possible include updates to NEWS along with your improvements. New functionality and bug fixes should be accompanied by matching unit tests. Installing development dependencies ----------------------------------- Contributing to Dulwich requires several more dependencies than are required to install the base package; they are used to run tests and various other checks. First, make sure your system has the Rust compiler and Cargo (the Rust package manager) installed. As this is a system-level requirement and not a Python library, the right way to install it depends on your platform. Please consult the `Rust documentation `__ to find out more. Next, you will need to set up your Python environment for Dulwich. An easy way to get started is to install the checked out Dulwich package in editable mode with ``dev`` extras, preferably in a new virtual environment: .. code:: console $ cd ~/path/to/checkouts/dulwich # Create and activate a virtual environment via your favourite method, such as pyenv, # uv, built-in venv module... $ python -m venv .venv && . .venv/bin/activate # Now install Dulwich and the required dependencies $ pip install -e ".[dev]" This will ensure the tools needed to test your changes are installed. It is not necessary to install Dulwich in editable mode (``-e``), but doing so is convenient for development, as code changes will be visible immediately, without requiring a reinstall (although any running Python processes will need to be reloaded to see the updated module definitions). 
Editable mode only applies to Python code; if you modify any of the Rust extension code, you will need to reinstall the package for the extensions to be recompiled. There are also other, optional dependencies which are needed to run the full test suite, implement optional features, and provide the full typing information. They are however not strictly necessary; the above is sufficient to start developing and have your PR pass the tests in most cases. Please consult the ``[project.optional-dependencies]`` section in ``pyproject.toml``. Coding style ------------ The code follows the PEP8 coding style. There are ``ruff`` rules in place that define the exact code style, please run it to make sure your changes are conformant. See also "Style and typing checks" below for details on running style checkers. Public methods, functions and classes should all have doc strings. Please use Google style docstrings to document parameters and return values. You can generate the documentation by running ``pydoctor --docformat=google dulwich`` from the root of the repository, and then opening ``apidocs/index.html`` in your web browser. String Types ~~~~~~~~~~~~ Like Linux, Git treats filenames as arbitrary bytestrings. There is no prescribed encoding for these strings, and although it is fairly common to use UTF-8, any raw byte strings are supported. For this reason, the lower levels in Dulwich treat git-based filenames as bytestrings. It is up to the Dulwich API user to encode and decode them if necessary. The porcelain may accept unicode strings and convert them to bytestrings as necessary on the fly (using 'utf-8'). * on-disk filenames: regular strings, or ideally, pathlib.Path instances * git-repository related filenames: bytes * object sha1 digests (20 bytes long): bytes * object sha1 hexdigests (40 bytes long): str (bytestrings on python2, strings on python3) Exceptions ~~~~~~~~~~ When catching exceptions, please catch the specific exception type rather than a more generic type (like OSError, IOError, Exception, etc.). This will ensure that you do not accidentally catch unrelated exceptions. The only exception is when you are reraising an exception, e.g. when re-raising an exception after logging it. Do not catch bare except, although ruff will warn you about this. Keep the code within a try/except block as small as possible, so that you do not accidentally catch unrelated exceptions. Deprecating functionality ~~~~~~~~~~~~~~~~~~~~~~~~~ Dulwich uses the `dissolve` package to manage deprecations. If you want to deprecate functionality, please use the `@replace_me` decorator from the root of the dulwich package. This will ensure that the deprecation is handled correctly: * It will be logged as a warning * When the version of Dulwich is bumped, the deprecation will be removed * Users can use `dissolve migrate` to automatically replace deprecated functionality in their code Tests ~~~~~ Dulwich has two kinds of tests: * Unit tests, which test individual functions and classes * Compatibility tests, which test that Dulwich behaves in a way that is compatible with C Git The former should never invoke C Git, while the latter may do so. This is to ensure that it is possible to run the unit tests in an environment where C Git is not available. Tests should not depend on the internet, or any other external services. Avoid using mocks if at all possible; rather, design your code to be easily testable without them. If you do need to use mocks, please use the ``unittest.mock`` module. 
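To make the above concrete, here is a minimal sketch of such a unit test. It
is only an illustration (the class name and blob contents are made up and not
part of the existing test suite): it needs no C Git binary, no network access
and no mocks.

.. code:: python

   import unittest

   from dulwich.objects import Blob


   class ExampleBlobTest(unittest.TestCase):
       """Self-contained unit test: no C Git, no external services, no mocks."""

       def test_blob_roundtrip(self) -> None:
           blob = Blob.from_string(b"hello, dulwich\n")
           # Hex object IDs are 40 characters long (see "String Types" above).
           self.assertEqual(40, len(blob.id))
           self.assertEqual(b"hello, dulwich\n", blob.as_raw_string())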
Running the tests ----------------- To run the testsuite, you should be able to run ``dulwich.tests.test_suite``. This will run the tests using unittest. .. code:: console $ python -m unittest dulwich.tests.test_suite The compatibility tests that verify Dulwich behaves in a way that is compatible with C Git are the slowest, so you may want to avoid them while developing: .. code:: console $ python -m unittest dulwich.tests.nocompat_test_suite testr and tox configuration is also present. Style and typing checks ----------------------- Several static analysis tools are used to ensure code quality and consistency. * Use ``ruff check`` to run all style-related checks. * Use ``ruff format --check`` to check code formatting. * Use ``mypy dulwich`` for typing checks. * Use ``codespell`` to check for common misspellings. Those checks are *mandatory*, a PR will not pass tests and will not be merged if they aren't successful. .. code:: console $ ruff check $ ruff format --check $ mypy dulwich $ codespell In some cases you can automatically fix issues found by these tools. To do so, you can run: .. code:: console $ ruff check --fix # or pass --unsafe-fixes to apply more aggressive fixes $ ruff format $ codespell --config .codespellrc -w Merge requests -------------- Please either send pull requests to the maintainer (jelmer@jelmer.uk) or create new pull requests on GitHub. Licensing --------- All contributions should be made under the same license that Dulwich itself comes under: both Apache License, version 2.0 or later and GNU General Public License, version 2.0 or later. dulwich-1.0.0/COPYING000066400000000000000000000716051513301442600142120ustar00rootroot00000000000000Dulwich may be used under the conditions of either of two licenses, the Apache License (version 2.0 or later) or the GNU General Public License, version 2.0 or later. SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. dulwich-1.0.0/Cargo.lock000066400000000000000000000116671513301442600150660ustar00rootroot00000000000000# This file is automatically @generated by Cargo. # It is not intended for manual editing. 
version = 4 [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "diff-tree-py" version = "0.25.2" dependencies = [ "pyo3", ] [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "indoc" version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" dependencies = [ "rustversion", ] [[package]] name = "libc" version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "memchr" version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "memoffset" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" dependencies = [ "autocfg", ] [[package]] name = "objects-py" version = "0.25.2" dependencies = [ "memchr", "pyo3", ] [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "pack-py" version = "0.25.2" dependencies = [ "memchr", "pyo3", "similar", ] [[package]] name = "portable-atomic" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" [[package]] name = "proc-macro2" version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" dependencies = [ "unicode-ident", ] [[package]] name = "pyo3" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d" dependencies = [ "indoc", "libc", "memoffset", "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "unindent", ] [[package]] name = "pyo3-build-config" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b455933107de8642b4487ed26d912c2d899dec6114884214a0b3bb3be9261ea6" dependencies = [ "target-lexicon", ] [[package]] name = "pyo3-ffi" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c85c9cbfaddf651b1221594209aed57e9e5cff63c4d11d1feead529b872a089" dependencies = [ "libc", "pyo3-build-config", ] [[package]] name = "pyo3-macros" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a5b10c9bf9888125d917fb4d2ca2d25c8df94c7ab5a52e13313a07e050a3b02" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", "syn", ] [[package]] name = "pyo3-macros-backend" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03b51720d314836e53327f5871d4c0cfb4fb37cc2c4a11cc71907a86342c40f9" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", "syn", ] [[package]] name = "quote" version = "1.0.43" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a" dependencies = [ "proc-macro2", ] [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "similar" version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "syn" version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "target-lexicon" version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba" [[package]] name = "unicode-ident" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unindent" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" dulwich-1.0.0/Cargo.toml000066400000000000000000000002121513301442600150710ustar00rootroot00000000000000[workspace] members = ["crates/*"] resolver = "2" [workspace.dependencies] pyo3 = ">=0.25, <0.28" [workspace.package] version = "1.0.0" dulwich-1.0.0/MANIFEST.in000066400000000000000000000006441513301442600147100ustar00rootroot00000000000000include NEWS include README.rst include Makefile include COPYING include CONTRIBUTING.rst include TODO include dulwich/contrib/README.swift.rst include dulwich/py.typed recursive-include docs conf.py *.txt Makefile make.bat recursive-include examples *.py recursive-include crates *.rs Cargo.toml graft tests/ graft testdata/ include tox.ini include dulwich.cfg include .testr.conf include Cargo.toml include Cargo.lock dulwich-1.0.0/Makefile000066400000000000000000000026661513301442600146200ustar00rootroot00000000000000PYTHON = python3 RUFF ?= $(PYTHON) -m ruff SETUP = $(PYTHON) setup.py TESTRUNNER ?= unittest RUNTEST = PYTHONHASHSEED=random PYTHONPATH=$(shell pwd)$(if $(PYTHONPATH),:$(PYTHONPATH),) $(PYTHON) -m $(TESTRUNNER) $(TEST_OPTIONS) COVERAGE = python3-coverage PYDOCTOR_ARGS ?= DESTDIR=/ all: build doc:: sphinx sphinx:: $(MAKE) -C docs html build:: $(SETUP) build $(SETUP) build_ext -i install:: $(SETUP) install --root="$(DESTDIR)" check:: build $(RUNTEST) tests.test_suite check-tutorial:: build $(RUNTEST) tests.tutorial_test_suite check-nocompat:: build $(RUNTEST) tests.nocompat_test_suite check-compat:: build $(RUNTEST) tests.compat_test_suite check-pypy:: clean $(MAKE) check-noextensions PYTHON=pypy check-noextensions:: clean $(RUNTEST) tests.test_suite check-contrib:: clean $(RUNTEST) -v dulwich.contrib.test_suite typing: $(PYTHON) -m mypy dulwich clean:: $(SETUP) clean --all rm -f dulwich/*.so style: $(RUFF) check . 
coverage: $(COVERAGE) run -m unittest tests.test_suite dulwich.contrib.test_suite coverage-html: coverage $(COVERAGE) html .PHONY: apidocs apidocs: $(PYTHON) -m pydoctor $(PYDOCTOR_ARGS) --intersphinx https://www.dulwich.io/api/objects.inv --intersphinx http://docs.python.org/3/objects.inv --docformat=google dulwich --project-url=https://www.dulwich.io/ --project-name=dulwich fix: ruff check --fix . reformat: ruff format . .PHONY: codespell codespell: codespell --config .codespellrc . dulwich-1.0.0/NEWS000066400000000000000000003173201513301442600136530ustar00rootroot000000000000001.0.0 2026-01-17 * Release of 1.0! From here on, Dulwich will not break backwards compatibility until 2.0 - although we may print ``DeprecationWarning`` when using deprecated functionality. Micro releases (1.x.y) will be reserved for important bugfixes. Major releases (1.x.0) will introduce new features and functionality without breaking backwards compatibility. (Jelmer Vernooij, #2007) 0.25.2 2026-01-11 * Fix test failure when GPG raises ``InvalidSigners`` instead of ``GPGMEError`` on systems without usable secret keys. (#2063) * Object store ``close()`` methods can now be called multiple times safely. Object stores now raise ``ResourceWarning`` when destroyed with unclosed resources. (Jelmer Vernooij) 0.25.1 2026-01-08 * Add signature vendor system for signing and verifying Git objects. Supports GPG, SSH signatures, and X.509 certificates. (Jelmer Vernooij) * Add support for partial clone with object filtering. (Jelmer Vernooij) * Fix sparse pattern matching in worktree operations. (Jelmer Vernooij) * ``dulwich.porcelain.status`` now returns regular strings. (Jelmer Vernooij, #889) * Fix AssertionError when accessing ref names with length matching binary hash length (e.g., 32 bytes for SHA-256). (Jelmer Vernooij, #2040) * Fix commit graph parsing failure when processing commits with 3+ parents (octopus merges) with incomplete EXTRA_EDGE_LIST chunk data. (Jelmer Vernooij, #2054) * Add ``parse_commit_broken`` function to parse broken commits. (Valentin Lorentz, Jelmer Vernooij) * Add basic ``dulwich.aiohttp`` module that provides server support. (Jelmer Vernooij) * Add callback-based authentication support for HTTP and proxy authentication in ``Urllib3HttpGitClient``. This allows applications to handle authentication dynamically without intercepting exceptions. Callbacks receive the authentication scheme information (via WWW-Authenticate or Proxy-Authenticate headers) and can provide credentials or cancel. (Jelmer Vernooij, #822) 0.25.0 2025-12-17 **PLEASE NOTE**: This release makes quite a lot of changes to public APIs. This is ahead of a 1.0 release, after which API changes will be kept backwards compatible. * Split out worktree module from porcelain into separate ``dulwich.worktree`` module for better code organization. (Jelmer Vernooij, #2037) * Split porcelain module into separate submodules: ``dulwich.porcelain.tags``, ``dulwich.porcelain.notes``, ``dulwich.porcelain.submodule``, and ``dulwich.porcelain.lfs``. Main porcelain module re-exports all functions for backward compatibility. (Jelmer Vernooij, #2032) * Ensure ``dulwich.porcelain`` package is properly installed as a directory structure with submodules. (Jelmer Vernooij, #2035) * Add tests for consistent license preamble across codebase and prevent ``os.environ`` usage in lower layers. (Jelmer Vernooij, #2033) * Add ``__all__`` exports to all modules for better API clarity and wildcard import support. 
(Jelmer Vernooij, #2022) * Fix ParamikoSSHVendor interface compatibility with SSHVendor. (Jelmer Vernooij, #2028) * Add fallback when HEAD is missing in dumb HTTP protocol, improving compatibility with repositories that don't have a HEAD reference. (Antoine Lambert, #2030) * Fix smudge filter subprocess fallback for special characters in path. (Petr Chmelar, #1878) * Fix UTF-8 decode error in process filter protocol when handling binary files. (Jelmer Vernooij, #2023) * Fix ``porcelain.add()`` to correctly handle ``None`` values in pathspec parameter. (Jelmer Vernooij, #2027) * Add ``--stat`` argument to ``dulwich diff`` command to display diffstat summary showing files changed and line additions/deletions. (Jelmer Vernooij, #2026) * Avoid signing commits in ``porcelain.stash()`` operations to prevent GPG prompt interruptions during automated stashing. (Jelmer Vernooij, #2012) * Improve error handling when trying to remove non-empty directories during worktree operations. (Jelmer Vernooij, #2004) * Move greenthreads support to dulwich/contrib. This code isn't really developed and only used by the swift support. (Jelmer Vernooij) * Move protocol-level peeled tags functions (``serialize_refs()``, ``write_info_refs()``, ``split_peeled_refs()``, ``strip_peeled_refs()``) from ``dulwich.refs`` to ``dulwich.protocol``. The ``^{}`` peeled tags syntax is now properly isolated to protocol-level code. Remove ``InfoRefsContainer`` class (functionality inlined into ``SwiftInfoRefsContainer``). (Jelmer Vernooij, #2009) * Fix ``get_unstaged_changes()`` to correctly pass Blob objects to filter callbacks instead of raw bytes. This fixes crashes when using ``.gitattributes`` files with filter callbacks like ``checkin_normalize``. (Jelmer Vernooij, #2010) * The ``ObjectID`` and ``Ref`` types are now newtypes, making it harder to accidentally pass the wrong type - as notified by mypy. Most of this is in lower-level code. (Jelmer Vernooij) * Implement support for ``core.sharedRepository`` configuration option. Repository files and directories now respect shared repository permissions for group-writable or world-writable repositories. Affects loose objects, pack files, pack indexes, index files, and other git metadata files. (Jelmer Vernooij, #1804) * Optimize status performance by using stat matching to skip reading and filtering unchanged files. This provides significant performance improvements for repositories with LFS filters, where filter operations can be very expensive. The optimization matches Git's behavior of using mtime and size comparisons to determine if files need processing. File entries now use nanosecond-resolution timestamps for more accurate change detection. (Jelmer Vernooij, #1999, #2013) * Add support for multi-pack index (MIDX) files for improved performance with multiple pack files. Supports reading and writing MIDX files, including mmap support for efficient loading. Enables faster object lookups across multiple packs. (Jelmer Vernooij, #1998) * Implement ``git restore`` and ``git switch`` commands with corresponding porcelain functions. The ``restore`` command allows restoring files from commits or the index, while ``switch`` provides branch switching functionality. (Jelmer Vernooij, #2003) * Add support for ``core.protectHFS`` configuration option to protect against HFS+ filesystem vulnerabilities. (Jelmer Vernooij) * Skip tests that require the merge3 module when it is not available, improving test compatibility across different Python environments. 
(Jelmer Vernooij, #2002) * Drop support for Python 3.9. (Jelmer Vernooij) * Add support for pack bitmap indexes for fast reachability queries. (Jelmer Vernooij, #1792) * Add support for ``git rerere`` (reuse recorded resolution) with CLI subcommands and porcelain functions. Supports ``rerere.enabled`` and ``rerere.autoupdate`` configuration. (Jelmer Vernooij, #1786) * Add support for ``git mailinfo`` command to extract patch information from email messages. Implements ``dulwich mailinfo`` CLI command, ``porcelain.mailinfo()``, and ``patch.mailinfo()`` with support for subject munging, -k/-b flags, --scissors, --encoding, and --message-id options. (Jelmer Vernooij, #1839) * Add support for column formatting. (Jelmer Vernooij, #1837) * Add ``dulwich diagnose`` command to display diagnostic information about the Python environment including Python version, PYTHONPATH, sys.path, Dulwich version, and installed dependencies with their versions. (Jelmer Vernooij, #1835) * Add support for SHA256 repositories. Dulwich can now read and write Git repositories using SHA256 object format. This includes support for loose objects, pack files (v1 and v2 indexes), tree parsing with SHA256 hashes, pack bitmap indexes, commit graphs, and network protocol operations (clone, fetch, push). The Rust extensions have been updated to support variable hash lengths. SHA256 repositories require format version 1 and the objectFormat extension. The ``dulwich init`` command now supports ``--objectformat`` option to create SHA256 repositories. Client and server implementations advertise and negotiate object-format capabilities. (Jelmer Vernooij, #1115, #1604) 0.24.10 2025-11-10 * Fix compatibility with python 3.9. (Jelmer Vernooij, #1991) 0.24.9 2025-11-10 * Fix passing key_filename and ssh_command parameters to SSHGitClient. (Saugat Pachhai) * Relax check to support subclasses of Urllib3HttpGitClient. Fixes regression from 0.24.2 where subclasses of Urllib3HttpGitClient would not receive the config object. (Saugat Pachhai) * Fix test_concurrent_ref_operations_compatibility test flakiness. (Jelmer Vernooij) * Fix warnings in test suite. (Jelmer Vernooij) 0.24.8 2025-10-29 * Add Rust implementation of pack delta creation (create_delta). The implementation uses the similar crate for efficient diff computation. (Jelmer Vernooij) * Extend ``http.extraHeader`` configuration to support per-URL settings. Allows configuring different HTTP headers for specific URLs using ``http..extraHeader`` syntax, enabling authentication in CI/CD environments like GitHub Actions. More specific URL configurations override less specific ones. (Jelmer Vernooij, #882) * Add support for ``GIT_REFLOG_ACTION`` environment variable in porcelain functions. (Jelmer Vernooij, #1811) * Add support for namespace isolation via ``NamespacedRefsContainer``. Implements Git's namespace feature for isolating refs within a single repository using the ``refs/namespaces/`` prefix. (Jelmer Vernooij, #1809) * Add support for GIT_FLUSH environment variable to control output buffering in CLI commands. When GIT_FLUSH=1, output is flushed after each write for real-time visibility. (Jelmer Vernooij, #1810) * Implement ``dulwich interpret-trailers`` functionality to parse and manipulate structured metadata (trailers) in commit messages. Adds ``porcelain.interpret_trailers()`` with support for parsing, adding, replacing, and formatting trailers. Also fixes the ``signoff`` parameter in ``porcelain.commit()`` to add ``Signed-off-by`` trailers. 
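  As a usage sketch for the ``signoff`` fix just mentioned (only the ``signoff`` flag is taken from the entry; the repository path and ``message`` keyword are assumptions about the usual ``porcelain.commit`` call shape)::

      from dulwich import porcelain

      # Append a Signed-off-by trailer based on the committer identity while
      # committing; the exact keyword arguments are assumed for this sketch.
      porcelain.commit(".", message="Fix pack parsing crash", signoff=True)
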
(Jelmer Vernooij, #1826) * Add support for recursive submodule updates via ``--recursive`` flag in ``dulwich submodule update`` command and ``recursive`` parameter in ``porcelain.submodule_update()``. (Jelmer Vernooij, #1813) * Add support for ``git maintenance`` command to optimize Git repository data. Implements gc, commit-graph, loose-objects, incremental-repack, pack-refs, and prefetch tasks. Supports automatic maintenance with ``--auto`` flag and task-specific configuration. (Jelmer Vernooij) * Add support for ``dulwich replace`` command to create refs that replace objects. (Jelmer Vernooij, #1834) * Implement advanced Git object specification support: index path lookup (``:``, ``:0:``, ``:1:``, ``:2:``, ``:3:``) for accessing files from the index and merge stages, and reflog time specifications (``@{time}``) using Git's approxidate format (e.g., ``HEAD@{yesterday}``, ``master@{2.weeks.ago}``). (Jelmer Vernooij, #1783) * Add ``dulwich stripspace`` command to remove unnecessary whitespace from text. (Jelmer Vernooij, #1838) 0.24.7 2025-10-23 * Add sparse index support for improved performance with large repositories. Implements reading and writing of sparse directory entries, index expansion/collapse operations, and the 'sdir' extension. (Jelmer Vernooij, #1797) * Add support for core.fsyncObjectFiles configuration option. (Jelmer Vernooij, #1817) * Work around typing module bug in Python 3.9.0 and 3.9.1 by using string annotation for Callable type in reflog.py. (Jelmer Vernooij, #1948) * Fix passing ssh_command, password, and key_filename parameters to the SSH vendor. Regression from 0.24.2. (Jelmer Vernooij, #1945) * Fix LFS URL validation to prevent DNS resolution errors when ``lfs.url`` is configured with an invalid value. Implement full support for ``file://`` URLs to access local LFS repositories, matching git-lfs behavior. (Jelmer Vernooij, #1951) 0.24.6 2025-10-19 * Fix import failure when ``sys.stdin`` is ``None``. The ``dulwich.server`` module can now be imported in environments where ``sys.stdin`` is ``None``, such as Windows GUI apps, apps started with ``pythonw``, or apps using ``ProcessPoolExecutor``. (Jelmer Vernooij, #1939) * Add support for ``dulwich reflog expire`` and ``dulwich reflog delete`` commands. (Jelmer Vernooij, #1798) * Add ``dulwich grep`` command. Supports regular expressions, case-insensitive search, line numbers, pathspec filtering, and respecting .gitignore patterns. (Jelmer Vernooij, #1776) * Add support for octopus merge strategy. (Jelmer Vernooij, #1816) * Add ``dulwich cherry`` command to find commits not merged upstream. Compares commits by patch ID to identify equivalent patches regardless of commit metadata. Supports automatic upstream detection from tracking branches and verbose mode to display commit messages. (Jelmer Vernooij, #1782) * Add support for ``dulwich mailsplit`` command to split mbox files and Maildir into individual message files. Supports mboxrd format, custom precision, and all standard git mailsplit options. (Jelmer Vernooij, #1840) * Implement recursive merge strategy for handling multiple merge bases (criss-cross merges). When multiple common ancestors exist, the algorithm creates a virtual merge base by recursively merging them, reducing false conflicts in complex merge scenarios. The recursive strategy is now used automatically by ``porcelain.merge()``. (Jelmer Vernooij, #1815)
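  A minimal usage sketch of the merge entry above (only ``porcelain.merge()`` itself is named in the entry; the positional arguments are assumptions)::

      from dulwich import porcelain

      # Merge another branch into the current branch; when several merge bases
      # exist, the recursive strategy described above is selected automatically.
      porcelain.merge(".", "feature-branch")
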
* Add support for ``dulwich show-branch`` command to display branches and their commits. Supports filtering by local/remote branches, topological ordering, list mode, independent branch detection, and merge base calculation. (Jelmer Vernooij, #1829) * Add ``dulwich config`` command to get and set repository or global configuration options. Supports getting/setting values, listing all config, getting all values for multivars, and unsetting values. (Jelmer Vernooij, #1775) 0.24.5 2025-10-15 * Add support for ``dulwich show-ref`` command to list references in a local repository. Supports filtering by branches/tags, pattern matching, dereferencing tags, verification mode, and existence checking. Available as ``porcelain.show_ref()`` and ``dulwich show-ref`` CLI command. (Jelmer Vernooij, #1830) * Fix HTTP authentication to preserve credentials from URLs when storing remote configuration. URLs with embedded credentials (like ``https://token@github.com/user/repo.git``) now correctly save those credentials to git config, allowing subsequent push operations to succeed. (Jelmer Vernooij, #1925) * Restore ``pool_manager`` parameter to ``get_transport_and_path`` and ``get_transport_and_path_from_url`` functions that was accidentally removed during type annotation refactoring. (Jelmer Vernooij, #1928) 0.24.4 2025-10-14 * Add compatibility for Python 3.14. (Jelmer Vernooij) * Re-enable android build. (Malcolm Smith) 0.24.3 2025-10-12 * Add ``dulwich merge-base`` command. (Jelmer Vernooij, #1831) * Add support for ``dulwich var`` command to display Git's logical variables (GIT_AUTHOR_IDENT, GIT_COMMITTER_IDENT, GIT_EDITOR, GIT_SEQUENCE_EDITOR, GIT_PAGER, GIT_DEFAULT_BRANCH). Available as ``porcelain.var()`` and ``dulwich var`` CLI command. (Jelmer Vernooij, #1841) * Add support for ``GIT_TRACE`` environment variable for debugging. Supports output to stderr (values "1", "2", or "true"), file descriptors (3-9), file paths, and directories (creates per-process trace files). (Jelmer Vernooij, #1863) * Add ``extract_signature()`` method to ``Commit`` and ``Tag`` classes that returns a (payload, signature, signature_type) tuple. Supports both PGP and SSH signature detection. (Jelmer Vernooij) * Fix Git filter protocol implementation to properly handle the two-phase response format (initial headers, content, final headers) as specified in the Git protocol documentation. This fixes compatibility with Git LFS and other filters that send status messages in final headers. (Jelmer Vernooij, #1889) * Add ``dulwich worktree repair`` command to repair worktree administrative files after worktrees or the main repository have been moved. (Jelmer Vernooij, #1799) * Add ``dulwich verify-tag`` command to check GPG signatures of tags. (Jelmer Vernooij, #1833) * Add ``dulwich verify-commit`` command to check GPG signatures of commits. (Jelmer Vernooij, #1832) 0.24.2 2025-09-25 * Added ``porcelain.shortlog`` function to summarize commits by author, similar to git shortlog. (Muhammad Usama, #1693) * Fix merge functionality to gracefully handle missing optional merge3 dependency by raising informative ImportError with installation instructions. (Jelmer Vernooij, #1759) * Fix worktree CLI tests to properly change to repository directory. (Jelmer Vernooij, #1738) * Add ``temporary_worktree`` context manager for creating temporary worktrees that are automatically cleaned up. (Jelmer Vernooij) * Add ``exist_ok`` parameter to ``add_worktree`` to allow creation with existing directories. 
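  A hedged sketch of the worktree helpers mentioned above (the import location and call shape are assumptions, not taken from the entries)::

      from dulwich import porcelain

      # Assumed shape: pass the repository path and get a temporary checkout
      # that is removed again when the block exits.
      with porcelain.temporary_worktree(".") as tmp:
          print("temporary worktree:", tmp)
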
(Jelmer Vernooij) * Add colorized diff support for the ``show`` command with ``--color`` argument. (Jelmer Vernooij, #1741) * Fix Windows path handling in ``_ensure_parent_dir_exists`` to correctly handle drive letters during checkout operations. (Jelmer Vernooij, #1751) * Fix Windows config loading to only use current Git config path, avoiding loading older config files. (Jelmer Vernooij, #1732) * Add interactive rebase support with ``dulwich rebase -i``, including support for pick, reword, edit, squash, fixup, drop, exec, and break commands. (Jelmer Vernooij, #1696) * Fix handling of CRLF line endings with ``core.autocrlf = input`` to prevent unchanged files from appearing as unstaged in status. (Jelmer Vernooij, #1773) * Add support for ``core.whitespace`` configuration for whitespace error detection and fixing. Supports blank-at-eol, space-before-tab, indent-with-non-tab, tab-in-indent, blank-at-eof, cr-at-eol, and tabwidth settings. (Jelmer Vernooij, #1806) * Add support for ``core.safecrlf`` configuration to check if CRLF/LF conversions would be reversible and optionally abort or warn on potentially lossy conversions. (Jelmer Vernooij, #1806) * Add support for ``http.extraHeader`` configuration to pass additional HTTP headers to the server when communicating over HTTP(S). (Jelmer Vernooij, #1769) * Optimize LFS filter performance by avoiding redundant disk writes when checking file status. The LFS store now checks if objects already exist before writing them to disk, significantly improving ``dulwich status`` performance in repositories with many LFS-tracked files. (Jelmer Vernooij, #1789) * Add filter server support. (Jelmer Vernooij, #1789) * Add support for ``patiencediff`` algorithm in diff. (Jelmer Vernooij, #1795) * Add IPv6 support for git:// protocol URLs. (Jelmer Vernooij, #1796) * Add support for ``core.preloadIndex`` configuration setting to enable parallel stat operations when checking for unstaged changes. This improves performance on slow filesystems like NFS. (Jelmer Vernooij, #1851) 0.24.1 2025-08-01 * Require ``typing_extensions`` on Python 3.10. (Jelmer Vernooij, #1735) 0.24.0 2025-08-01 * Split out ``WorkTree`` from ``Repo``. (Jelmer Vernooij) * Add comprehensive git worktree support including ``WorkTreeContainer`` class, ``WorkTreeInfo`` objects, and full CLI/porcelain implementations for add, list, remove, prune, lock, unlock, and move operations. (Jelmer Vernooij, #1710, #1632) * Add support for ``-a`` argument to ``dulwich.cli.commit``. (Jelmer Vernooij) * Add support for ``--amend`` argument to ``dulwich.cli.commit`` and ``dulwich.porcelain.commit``. (Jelmer Vernooij) * Add support for merge drivers. (Jelmer Vernooij) * Add support for Git revision syntax operators ``~``, ``^``, ``^{}``, ``@{N}``, and ``:path`` in ``dulwich.objectspec.parse_object``, e.g. ``HEAD~1``, ``HEAD^2``, ``v1.0^{}``, ``HEAD@{1}``, ``HEAD:README``. (Jelmer Vernooij) * Add support for ``GIT_CONFIG_GLOBAL`` and ``GIT_CONFIG_SYSTEM`` environment variables to override global and system configuration paths. (Jelmer Vernooij, #1193) * ``dulwich.porcelain.diff``: Support diffing two commits and diffing cached and working tree. (Jelmer Vernooij) * Add ``format-patch`` command in porcelain. (Jelmer Vernooij) * Add functions for creating bundles and ``BundleClient`` for interacting with bundles. (Jelmer Vernooij, #1246) * Add support for ``core.commitGraph`` configuration setting to control whether commit-graph files are used for performance optimization. 
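  As an illustration of toggling that setting from Python (a sketch; the value written here is an assumption, using dulwich's regular config API)::

      from dulwich.repo import Repo

      repo = Repo(".")
      config = repo.get_config()
      # Turn commit-graph usage off for this repository.
      config.set((b"core",), b"commitGraph", b"false")
      config.write_to_path()
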
(Jelmer Vernooij) * Add ``reflog`` command in porcelain. (Jelmer Vernooij) * Add ``locked_ref`` context manager for atomic ref operations. (Jelmer Vernooij) * Fix bug in ``DiskRefsContainer._remove_packed_ref`` that prevented packed ref deletions from being persisted to disk. (Jelmer Vernooij) * Optimize writing unchanged refs by avoiding unnecessary fsync when ref already has the desired value. File locking behavior is preserved to ensure proper concurrency control. (Dan Villiom Podlaski Christiansen, Jelmer Vernooij, #1120) * Fix Unicode filename encoding issue on Windows where non-ASCII filenames were corrupted during clone/checkout operations. (Jelmer Vernooij, #203) 0.23.2 2025-07-07 * Print deprecations on usage, not import. (Alyssa Coghlan, #1650) * Add support for ``core.protectHFS`` configuration setting to protect against paths that could be misinterpreted on HFS+ filesystems. (Jelmer Vernooij, #246) * Only write Git index extensions when they contain meaningful data. Previously, dulwich would write empty extensions to the index file, causing unnecessary bloat. (Andrew Shadura, Jelmer Vernooij, #1643) * Document that ``porcelain.push`` returns per-ref status information in the ``SendPackResult`` object. Added test coverage to verify this functionality works as expected. (Jelmer Vernooij, #780) * Add porcelain submodule commands: ``submodule_update``, ``submodule_add``g CLI command, and ``submodule_update`` CLI command. Add ``--recurse-submodules`` option to ``clone`` command. (#506, Jelmer Vernooij) * Support popping stashes. (Jelmer Vernooij) * Add support for parsing Git attributes from .gitattributes files. This enables proper handling of text/binary detection, line ending normalization, and filter specifications for files. (Jelmer Vernooij, #1211) * Add git bisect functionality including core bisect logic, porcelain commands (bisect_start, bisect_bad, bisect_good, bisect_skip, bisect_reset, bisect_log, bisect_replay), and CLI support. (Jelmer Vernooij, #1631) * Fix ``porcelain.describe()`` to dynamically determine hash length based on uniqueness, matching git describe behavior more closely. Previously used a hardcoded 7-character hash length. (Jelmer Vernooij, #824) * Add test for ``porcelain.add()`` to verify files can be added when the current working directory is within a gitignored directory. (Jelmer Vernooij, #550) * ParamikoSSHVendor now reads SSH configuration from ~/.ssh/config. Host settings including hostname, user, port, and identity file are now respected when establishing SSH connections. (Jelmer Vernooij, #443) 0.23.1 2025-06-30 * Support ``untracked_files="normal"`` argument to ``porcelain.status``, and make this the default. (Jelmer Vernooij, #835) * Fix ``parse_commit`` to properly dereference annotated tags when checking out tags. Previously, checking out an annotated tag would fail with a KeyError. (Jelmer Vernooij, #1638) * Handle different file type transitions properly in ``update_working_tree`` (Jelmer Vernooij, #1638) * Fix KeyError when pulling from a shallow clone. Handle missing commits gracefully in graph traversal operations for shallow repositories. (Jelmer Vernooij, #813) * Return symrefs from ls_refs. (Jelmer Vernooij, #863) * Support short commit hashes in ``porcelain.reset()``. (Jelmer Vernooij, #1154) * Support dumb repository access. (Jelmer Vernooij, #1097) * Fix TypeError when cloning repositories with bytes paths on Windows. (Jelmer Vernooij, #973) * Support ``depth`` for local clones. 
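  For example (a sketch; only the ``depth`` keyword comes from the entry above, the rest follows the usual ``porcelain.clone`` call shape)::

      from dulwich import porcelain

      # Shallow local clone containing only the most recent commit.
      porcelain.clone("/path/to/source-repo", "shallow-copy", depth=1)
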
(Jelmer Vernooij) * Add basic support for managing Notes. (Jelmer Vernooij) * Add basic ``cherry-pick`` subcommand. (#1599, Jelmer Vernooij) * Add ``revert`` command to ``dulwich.porcelain`` and CLI. (#1599, Jelmer Vernooij) * Add annotate support as well as ``annotate`` and ``blame`` commands. (#245, Jelmer Vernooij) * Fix ``apply_delta`` to raise ``ApplyDeltaError`` instead of ``AssertionError`` when the source buffer size doesn't match the delta header. This issue only affected the pure Python implementation when the Rust extension was not available. The Rust implementation already raised the correct exception. (#1606, Jelmer Vernooij) * Fix ``porcelain.reset --hard`` to properly delete files that don't exist in the target tree. Previously, when resetting to a remote branch, files deleted in the remote were not removed locally due to incorrect path normalization on Windows. (#840, Jelmer Vernooij) * Add support for includes in configuration files. (#1216, Jelmer Vernooij) * Support timeouts for HTTP client operations. (Jelmer Vernooij) * Add support for ``reset --mixed`` and ``reset --soft`` modes in ``porcelain.reset()`` and the CLI. Mixed reset updates HEAD and index but leaves working tree unchanged. Soft reset only updates HEAD. (Jelmer Vernooij) * Apply line-ending normalization in ``build_index_from_tree`` to respect ``core.autocrlf`` configuration during checkout operations. (Jelmer Vernooij, #663) * Add ``prune`` method to object stores for cleaning up orphaned temporary pack files. This is now called by ``garbage_collect()`` to match Git's behavior. Also added ``prune`` command to ``dulwich.porcelain``. (Jelmer Vernooij, #558) * Fix ``porcelain.remove()`` to work correctly when called from a directory other than the repository root. Relative paths are now interpreted as relative to the repository root rather than the current working directory. (Jelmer Vernooij, #821) * Add support for auto garbage collection, and invoke from some porcelain commands. (Jelmer Vernooij, #1600) * Add ``filter-branch`` support to ``dulwich.porcelain`` and ``dulwich.filter_branch`` module for rewriting commit history. Supports filtering author, committer, and message fields. (#745, Jelmer Vernooij) * Add ``mv`` porcelain command. (Jelmer Vernooij, #1633) 0.23.0 2025-06-21 * Add basic ``rebase`` subcommand. (Jelmer Vernooij) * Add ``gc`` command to ``dulwich.porcelain.`` (Jelmer Vernooij, #92) * Add ``unpack-objects`` plumbing command to unpack objects from pack files into loose objects in the repository. This command extracts all objects from a pack file and writes them to the object store as individual files. Available in both ``dulwich.porcelain.unpack_objects()`` and as a CLI command ``dulwich unpack-objects``. (Jelmer Vernooij) * Add ``merge-tree`` plumbing command to ``dulwich.porcelain`` and CLI. This command performs three-way tree merges without touching the working directory or creating commits, similar to ``dulwich merge-tree``. It outputs the merged tree SHA and lists any conflicted paths. (Jelmer Vernooij) * Add ``porcelain.count_objects()`` function to count unpacked objects and their disk usage. Returns a tuple of (count, size) for simple usage or a ``CountObjectsResult`` dataclass with detailed statistics when ``verbose=True``. (Jelmer Vernooij) * Add support for pack index format version 3. This format supports variable hash sizes to enable future SHA-256 support. 
The implementation includes reading and writing v3 indexes with proper hash algorithm identification (1 for SHA-1, 2 for SHA-256). (Jelmer Vernooij) * Fix ``LocalGitClient`` assertion error when fetching externally cloned repositories into ``MemoryRepo``. Previously, the client would fail with an AssertionError when trying to process pack data from repositories that were cloned externally. (Jelmer Vernooij, #1179) * Add support for ``os.PathLike`` objects throughout the API. Functions that accept file paths now support ``pathlib.Path`` objects in addition to strings and bytes. This includes repository operations, configuration file handling, ignore file processing, and all major entry points. (Jelmer Vernooij, #1074) * Add support for ``format`` argument to ``Repo.init()`` and ``Repo.init_bare()`` to specify repository format version (0 or 1). This allows creating repositories with different format versions by setting the ``core.repositoryformatversion`` configuration value. (Jelmer Vernooij) * Fix Rust implementation of ``sorted_tree_items()`` to correctly handle submodules. Previously, submodules (mode 0o160000) were incorrectly treated as directories in the sorting order, causing different results compared to the Python implementation. (Jelmer Vernooij, #1325) * Fix ``porcelain.add()`` to stage both untracked and modified files when no paths are specified. Previously, only untracked files were staged, inconsistent with Git's behavior. Now behaves like ``git add -A`` when called without paths. (Jelmer Vernooij, #746) * Fix ``porcelain.add()`` symlink handling to allow adding symlinks that point outside the repository. Previously, the function would fail when trying to add a symlink pointing outside the repo due to aggressive path resolution. Now only resolves the parent directory for symlinks, matching Git's behavior. (Jelmer Vernooij, #789) * Fix ``porcelain.add()`` when adding repository root path or directories. Previously, adding the repository path itself would incorrectly stage ``b'./'`` instead of the actual untracked files, leading to repository corruption. (Jelmer Vernooij, #1178, #655) * Improve ``porcelain.add()`` documentation to correctly describe default behavior. (Jelmer Vernooij, #895) * Fix gitignore pattern matching for directory negation patterns. Patterns like ``!data/*/`` now correctly unignore direct subdirectories while still ignoring files in the parent directory, matching Git's behavior. The ``is_ignored()`` method now documents that directory paths should end with ``/`` for consistent behavior. (Jelmer Vernooij, #1203) * Support quote_path flag for ignore checking. (Jelmer Vernooij) * Clarify documentation for ``IgnoreFilter`` and ``IgnoreFilterManager`` to explicitly state that directory paths should include trailing slashes when checking if they are ignored. This matches Git's behavior and ensures consistent results. (Jelmer Vernooij, #972) * Add support for Git's ``feature.manyFiles`` configuration and index version 4. This enables faster Git operations in large repositories through path prefix compression (30-50% smaller index files) and optional hash skipping for faster writes. Supports ``feature.manyFiles``, ``index.version``, and ``index.skipHash`` configuration options. (Jelmer Vernooij, #1061, #1462) * In dulwich.porcelain docstring, list functions by their Python identifiers. (Marnanel Thurman) * cli: add basic branch management commands (James Addison, #1514) * Fix wheels workflow. 
(Jelmer Vernooij) * ``Config.set`` replaces values by default, ``Config.add`` appends them. (Jelmer Vernooij, #1545) * Support ``core.sshCommand`` setting. (Jelmer Vernooij, #1548) * Bump PyO3 to 0.25. (Jelmer Vernooij) * In ``SubprocessClient`` time out after 60 seconds when the subprocess hasn't terminated when closing the channel. (Jelmer Vernooij) * Add type hint for ``dulwich.client.get_ssh_vendor``. (Jelmer Vernooij, #1471) * Add basic merge command. (Jelmer Vernooij) * Update working tree in pull. (Jelmer Vernooij, #452) * Support switching branch in a way that updates working tree. (Jelmer Vernooij, #576) * Fix typing for ``dulwich.client`` methods that take repositories. (Jelmer Vernooij, #1521) * Fix handling of casing of subsection names in config. (Jelmer Vernooij, #1183) * Update working tree in pull. (Jelmer Vernooij, #452) * Use ``dissolve`` to manage deprecations. (Jelmer Vernooij) * Handle trailing backslashes in config files appropriately. (Jelmer Vernooij, #1088) * Add basic support for reading git commit graphs. (Jelmer Vernooij, #1191) * Port remaining ``dulwich.cli`` commands from getopt to argparse. (Jelmer Vernooij) * Add basic support for reftables. (Jelmer Vernooij, #1366) 0.22.8 2025-03-02 * Allow passing in plain strings to ``dulwich.porcelain.tag_create`` (Jelmer Vernooij, #1499) * Bump PyO3 to 0.23.5. (Jelmer Vernooij) * Add sparse checkout cone mode support (Louis Maddox, #1497) * Add skip-worktree support. (Louis Maddox) * Add "index.skipHash" option support. (Jan Rűegg) * Repo docstring improvements. (Marnanel Thurman) 0.22.7 2024-12-19 * Fix serializing of commits with empty commit messages. (Castedo Ellerman, #1429) 0.22.6 2024-11-16 * ``ObjectStore.iter_prefix``: fix handling of missing loose object directories. (Jelmer Vernooij) * Reject invalid refcontainer values (not 40 characters or symref). (Arun Babu Neelicattu) * Add type hints to various functions. (Castedo Ellerman) 0.22.5 2024-11-07 * Drop support for Python 3.8. (Jelmer Vernooij) * Fix refspec handling in porcelain.pull. (Jelmer Vernooij) * Drop broken refspec support in porcelain.clone. (Jelmer Vernooij) * Provide ``ref_prefix`` functionality client-side if the server does not support it. (Jelmer Vernooij) * Consistently honor ``ref_prefix`` and ``protocol_version`` arguments in client. (Jelmer Vernooij) * Strip pkt-line when negotiating protocol v2. Fixes compatibility with gerrit. (Rémy Pecqueur, #1423) * Don't pull in ``setuptools_rust`` when building pure package. (Eli Schwartz) * Return peeled refs from ``GitClient.get_refs`` if protocol-v2 is used. (Stefan Sperling, #1410) * Drop outdated performance file. (Jelmer Vernooij, #1411) 0.22.4 2024-11-01 * Fix handling of symrefs with protocol v2. (Jelmer Vernooij, #1389) * Add ``ObjectStore.iter_prefix``. (Jelmer Vernooij) * Revert back to version 3 of ``Cargo.lock``, to allow building with older Cargo versions. (Jelmer Vernooij) * Use a default ref-prefix when fetching with git protocol v2 (Stefan Sperling, #1389) * Add `ObjectStore.iter_prefix`. (Jelmer Vernooij) 0.22.3 2024-10-15 * Improve wheel building in CI, so we can upload wheels for the next release. (Jelmer Vernooij) 0.22.2 2024-10-09 * Ship ``Cargo.lock``. (Jelmer Vernooij, #1287) * Ship ``tests/`` and ``testdata/`` in sdist. (Jelmer Vernooij, #1292) * Add initial integration with OSS-Fuzz for continuous fuzz testing and first fuzzing test (David Lakin, #1302) * Drop Python 3.7 support. (Jelmer Vernooij) * Improve fuzzing coverage (David Lakin) * Support Python 3.13. 
(Edgar Ramírez-Mondragón, #1352) * Initial support for smart protocol v2. (Stefan Sperling) 0.22.1 2024-04-23 * Handle alternate case for worktreeconfig setting (Will Shanks, #1285) * Ship rust files. (Jelmer Vernooij, #1287) 0.22.0 2024-04-22 * Stop installing docs/ as part of package code. (Jelmer Vernooij, #1248) * Move tests to root. (Jelmer Vernooij, #1024) * Convert the optional C implementations to Rust. (Jelmer Vernooij) 0.21.7 2023-12-05 * Fix NameError when encountering errors during HTTP operation. (Jelmer Vernooij, #1208) * Raise exception when default identity can't be found. (Jelmer Vernooij) * Add a dedicated exception class for unresolved deltas. (Jelmer Vernooij, #1221) * Support credentials in proxy URL. (Jelmer Vernooij, #1227) * Add ``dulwich.porcelain.for_each_ref``. (Daniele Trifirò) 0.21.6 2023-09-02 * Convert _objects.c to rust. (Jelmer Vernooij) * index: Handle different stages of conflicted paths. (Kevin Hendricks, Jelmer Vernooij) * Improve LCA finding performance. (Kevin Hendricks) * client: Handle Content-Type with encoding set. (Antoine Lambert) * Only import _hashlib for type checking. (David Hotham) * Update docs regarding building dulwich without c bindings (#103) (Adam Plaice) * objects: Define a stricter return type for _parse_message (Vincent Lorentz) * Raise GitProtocolError when encountering HTTP Errors in HTTPGitClient. (Jelmer Vernooij, #1199) 0.21.5 2023-05-04 * Be more tolerant to non-3-length tuple versions. (Jelmer Vernooij) 0.21.4.1 2023-05-04 * Support ``core.symlinks=false``. (Jelmer Vernooij, #1169) * Deprecate ``dulwich.objects.parse_commit``. * Fix fetching into MemoryRepo. (Jelmer Vernooij, #1157) * Support ``init.defaultBranch`` config. (Jelmer Vernooij) * Fix ``ObjectStore.iterobjects_subset()`` when hex shas are passed for objects that live in packs. (Jelmer Vernooij, #1166) * client: Handle absolute path as redirect location in HTTP client. (Antoine Lambert) 0.21.3 2023-02-17 * Add support for ``worktreeconfig`` extension. (Jelmer Vernooij) * Deprecate ``Commit.extra``; the Git project specifically discourages adding custom lines, and the contents of ``Commit.extra`` are unpredictable as contents may be different between different versions of Dulwich with support for different headers. ``Commit._extra`` still exists. (Jelmer Vernooij) 0.21.2 2023-01-18 * Fix early file close bug in ``dulwich.pack.extend_pack``. (Jelmer Vernooij) 0.21.1 2023-01-17 * Factor out ``dulwich.pack.extend_pack``. (Jelmer Vernooij) 0.21.0 2023-01-16 * Pack internals have been significantly refactored, including significant low-level API changes. As a consequence of this, Dulwich now reuses pack deltas when communicating with remote servers, which brings a big boost to network performance. (Jelmer Vernooij) * Add 'pack-refs' command. (Dan Villiom Podlaski Christiansen) * Handle more errors when trying to read a ref (Dan Villiom Podlaski Christiansen) * Allow for reuse of existing deltas while creating pack files (Stefan Sperling) * cli: fix argument parsing for pack-objects --stdout (Stefan Sperling) * cli: open pack-objects output files in binary mode to avoid write() error (Stefan Sperling) * Bump minimum python version to 3.7. (Jelmer Vernooij) * honor no_proxy environment variable (#1098, afaul) * In HTTP Git Client, allow missing Content-Type. 
(Jelmer Vernooij) * Fix --pure builds (Jelmer Vernooij, #1093) * Allow passing abbrev to describe (#1084, Seppo Yli-Olli) 0.20.50 2022-10-30 * Add --deltify option to ``dulwich pack-objects`` which enables deltification, and add initial support for reusing suitable deltas found in an existing pack file. (Stefan Sperling) * Fix Repo.reset_index. Previously, it instead took the union with the given tree. (Christian Sattler, #1072) * Add -b argument to ``dulwich clone``. (Jelmer Vernooij) * On Windows, provide a hint about developer mode when creating symlinks fails due to a permission error. (Jelmer Vernooij, #1005) * Add new ``ObjectID`` type in ``dulwich.objects``, currently just an alias for ``bytes``. (Jelmer Vernooij) * Support repository format version 1. (Jelmer Vernooij, #1056) * Support \r\n line endings with continuations when parsing configuration files. (Jelmer Vernooij) * Fix handling of SymrefLoop in RefsContainer.__setitem__. (Dominic Davis-Foster, Jelmer Vernooij) 0.20.46 2022-09-06 * Apply insteadOf to rsync-style location strings (previously it was just applied to URLs). (Jelmer Vernooij, python-poetry/poetry#6329) * Drop use of certifi, instead relying on urllib3's default code to find system CAs. (Jelmer Vernooij, #1025) * Implement timezone parsing in porcelain. (springheeledjack0, #1026) * Drop support for running without setuptools. (Jelmer Vernooij) * Ensure configuration is loaded when running "dulwich clone". (Jelmer Vernooij) * Build 32 bit wheels for Windows. (Benjamin Parzella) * tests: Ignore errors when deleting GNUPGg home directory. Fixes spurious errors racing gnupg-agent. Thanks, Matěj Cepl. Fixes #1000 * config: Support closing brackets in quotes in section names. (Jelmer Vernooij, #10124) * Various and formatting fixes. (Kian-Meng Ang) * Document basic authentication in dulwich.porcelain.clone. (TuringTux) * Flush before calling fsync, ensuring buffers are filled. (wernha) * Support GPG commit signing. (springheeledjack0) * Add python 3.11 support. (Saugat Pachhai) * Allow missing GPG during tests. (Jakub Kulík) * status: return posix-style untracked paths instead of nt-style paths on win32 (Daniele Trifirò) * Honour PATH environment when running C Git for testing. (Stefan Sperling) * Split out exception for symbolic reference loops. (Jelmer Vernooij) * Move various long-deprecated methods. (Jelmer Vernooij) 0.20.45 2022-07-15 * Add basic ``dulwich.porcelain.submodule_list`` and ``dulwich.porcelain.submodule_add`` (Jelmer Vernooij) 0.20.44 2022-06-30 * Fix reading of chunks in server. (Jelmer Vernooij, #977) * Support applying of URL rewriting using ``insteadOf`` / ``pushInsteadOf``. (Jelmer Vernooij, #706) 0.20.43 2022-06-07 * Lazily import url2pathname. (Jelmer Vernooij) * Drop caching of full HTTP response. Attempt #2. (jelmer Vernooij, Antoine Lambert, #966) 0.20.42 2022-05-24 * Drop ``RefsContainer.watch`` that was always flaky. (Jelmer Vernooij, #886) 0.20.41 2022-05-24 * Fix wheel uploading, properly. (Ruslan Kuprieiev) 0.20.40 2022-05-19 * Fix wheel uploading. (Daniele Trifirò, Jelmer Vernooij) 0.20.39 2022-05-19 0.20.38 2022-05-17 * Disable paramiko tests if paramiko is not available. (Michał Górny) * Set flag to re-enable paramiko server side on gentoo for running paramiko tests. (Michał Górny) * Increase tolerance when comparing time stamps; fixes some spurious test failures on slow CI systems. (Jelmer Vernooij) * Revert removal of caching of full HTTP response. This breaks access to some HTTP servers. 
(Jelmer Vernooij) 0.20.37 2022-05-16 * Avoid making an extra copy when fetching pack files. (Jelmer Vernooij) * Add ``porcelain.remote_remove``. (Jelmer Vernooij, #923) 0.20.36 2022-05-15 * Add ``walk_untracked`` argument to ``porcelain.status``. (Daniele Trifirò) * Add tests for paramiko SSH Vendor. (Filipp Frizzy) 0.20.35 2022-03-20 * Document the ``path`` attribute for ``Repo``. (Jelmer Vernooij, #854) 0.20.34 2022-03-14 * Add support for multivars in configuration. (Jelmer Vernooij, #718) 0.20.33 2022-03-05 * Fix handling of escaped characters in ignore patterns. (Jelmer Vernooij, #930) * Add ``dulwich.contrib.requests_vendor``. (epopcon) * Ensure git config is available in a linked working tree. (Jesse Cureton, #926) 0.20.32 2022-01-24 * Properly close result repository during test. (Jelmer Vernooij, #928) 0.20.31 2022-01-21 * Add GitClient.clone(). (Jelmer Vernooij, #920) 0.20.30 2022-01-08 0.20.29 2022-01-08 * Support staging submodules. (Jelmer Vernooij) * Drop deprecated Index.iterblobs and iter_fresh_blobs. (Jelmer Vernooij) * Unify clone behaviour of ``Repo.clone`` and ``porcelain.clone``, and add branch parameter for clone. (Peter Rowlands, #851) 0.20.28 2022-01-05 * Fix hook test on Mac OSX / Linux when dulwich is not installed system-wide. (Jelmer Vernooij, #919) * Cope with gecos being unset. (Jelmer Vernooij, #917) 0.20.27 2022-01-04 * Allow adding files to repository in pre-commit hook. (Jelmer Vernooij, #916) * Raise SubmoduleEncountered in ``tree_lookup_path``. (Jelmer Vernooij) 0.20.26 2021-10-29 * Support os.PathLike arguments to Repo.stage(). (Jan Wiśniewski, #907) * Drop support for Python 3.5. (Jelmer Vernooij) * Add ``dulwich.porcelain._reset_file``. (Ded_Secer) * Add ``Repo.unstage``. (Ded_Secer) 0.20.25 2021-08-23 * Fix ``dulwich`` script when installed via setup.py. (Dan Villiom Podlaski Christiansen) * Make default file mask consistent with Git. (Dan Villiom Podlaski Christiansen, #884) 0.20.24 2021-07-18 * config: disregard UTF-8 BOM when reading file. (Dan Villiom Podlaski Christiansen) * Skip lines with spaces only in .gitignore. (Andrey Torsunov, #878) * Add a separate HTTPProxyUnauthorized exception for 407 errors. (Jelmer Vernooij, #822) * Split out a AbstractHTTPGitClient class. (Jelmer Vernooij) 0.20.23 2021-05-24 * Fix installation of GPG during package publishing. (Ruslan Kuprieiev) 0.20.22 2021-05-24 * Prevent removal of refs directory when the last ref is deleted. (Jelmer Vernooij) * Fix filename: MERGE_HEADS => MERGE_HEAD. (Jelmer Vernooij, #861) * For ignored directories, porcelain.add and porcelain.status now only return the path to directory itself in the list of ignored paths. Previously, paths for all files within the directory would also be included in the list. (Peter Rowlands, #853) * Provide depth argument to ``determine_wants``. (Peter Rowlands) * Various tag signature handling improvements. (Daniel Murphy) * Add separate Tag.verify(). (Peter Rowlands) * Add support for version 3 index files. (Jelmer Vernooij) * Fix autocrlf=input handling. (Peter Rowlands, Boris Feld) * Attempt to find C Git global config on Windows. (Peter Rowlands) API CHANGES * The APIs for writing and reading individual index entries have changed to handle lists of (name, entry) tuples rather than tuples. 0.20.21 2021-03-20 * Add basic support for a GcsObjectStore that stores pack files in gcs. (Jelmer Vernooij) * In porcelain.push, default to local active branch. (Jelmer Vernooij, #846) * Support fetching symrefs. 
(Jelmer Vernooij, #485, #847) * Add aarch64 wheel building. (odidev, Jelmer Vernooij) 0.20.20 2021-03-03 * Implement ``Stash.drop``. (Peter Rowlands) * Support untracked symlinks to paths outside the repository. (Peter Rowlands, #842) 0.20.19 2021-02-11 * Fix handling of negative matches in nested gitignores. (Corentin Hembise, #836) 0.20.18 2021-02-04 * Fix formatting in setup.py. (Jelmer Vernooij) * Add release configuration. (Jelmer Vernooij) 0.20.17 2021-02-04 * credentials: ignore end-of-line character. (Georges Racinet) * Fix failure in get_untracked_paths when the repository contains symlinks. (#830, #793, mattseddon) * docs: Clarify that Git objects are created on `git add`. (Utku Gultopu) 0.20.16 2021-01-16 * Add flag to only attempt to fetch ignored untracked files when specifically requested. (Matt Seddon) 0.20.15 2020-12-23 * Add some functions for parsing and writing bundles. (Jelmer Vernooij) * Add ``no_verify`` flag to ``porcelain.commit`` and ``Repo.do_commit``. (Peter Rowlands) * Remove dependency on external mock module. (Matěj Cepl, #820) 0.20.14 2020-11-26 * Fix some stash functions on Python 3. (Peter Rowlands) * Fix handling of relative paths in alternates files on Python 3. (Georges Racinet) 0.20.13 2020-11-22 * Add py.typed to allow type checking. (David Caro) * Add tests demonstrating a bug in the walker code. (Doug Hellman) 0.20.11 2020-10-30 * Fix wheels build on Linux. (Ruslan Kuprieiev) * Enable wheels build for Python 3.9 on Linux. (Jelmer Vernooij) 0.20.8 2020-10-29 * Build wheels on Mac OS X / Windows for Python 3.9. (Jelmer Vernooij) 0.20.7 2020-10-29 * Check core.repositoryformatversion. (Jelmer Vernooij, #803) * Fix ACK/NACK handling in archive command handling in dulwich.client. (DzmitrySudnik, #805) 0.20.6 2020-08-29 * Add a ``RefsContainer.watch`` interface. (Jelmer Vernooij, #751) * Fix pushing of new branches from porcelain.push. (Jelmer Vernooij, #788) * Honor shallows when pushing from a shallow clone. (Jelmer Vernooij, #794) * Fix porcelain.path_to_tree_path for Python 3.5. (Boris Feld, #777) * Add support for honor proxy environment variables for HTTP. (Aurélien Campéas, #797) 0.20.5 2020-06-22 * Print a clearer exception when setup.py is executed on Python < 3.5. (Jelmer Vernooij, #783) * Send an empty pack to clients if they requested objects, even if they already have those objects. Thanks to Martijn Pieters for the detailed bug report. (Jelmer Vernooij, #781) * porcelain.pull: Don't ask for objects that we already have. (Jelmer Vernooij, #782) * Add LCA implementation. (Kevin Hendricks) * Add functionality for finding the merge base. (Kevin Hendricks) * Check for diverged branches during push. (Jelmer Vernooij, #494) * Check for fast-forward during pull. (Jelmer Vernooij, #666) * Return a SendPackResult object from GitClient.send_pack(). (Jelmer Vernooij) * ``GitClient.send_pack`` now sets the ``ref_status`` attribute on its return value to a dictionary mapping ref names to error messages. Previously, it raised UpdateRefsError if any of the refs failed to update. (Jelmer Vernooij, #780) * Add a ``porcelain.Error`` object that most errors in porcelain derive from. (Jelmer Vernooij) * Fix argument parsing in dulwich command-line app. (Jelmer Vernooij, #784) 0.20.3 2020-06-14 * Add support for remembering remote refs after push/pull. (Jelmer Vernooij, #752) * Support passing tree and output encoding to dulwich.patch.unified_diff. (Jelmer Vernooij, #763) * Fix pushing of new refs over HTTP(S) when there are no new objects to be sent. 
(Jelmer Vernooij, #739) * Raise new error HTTPUnauthorized when the server sends back a 401. The client can then retry with credentials. (Jelmer Vernooij, #691) * Move the guts of bin/dulwich to dulwich.cli, so it is easier to test or import. (Jelmer Vernooij) * Install dulwich script from entry_points when setuptools is available, making it slightly easier to use on Windows. (Jelmer Vernooij, #540) * Set python_requires>=3.5 in setup.py. (Manuel Jacob) 0.20.2 2020-06-01 * Brown bag release to fix uploads of Windows wheels. 0.20.1 2020-06-01 * Publish binary wheels for: Windows, Linux, Mac OS X. (Jelmer Vernooij, #711, #710, #629) 0.20.0 2020-06-01 * Drop support for Python 2. (Jelmer Vernooij) * Only return files from the loose store that look like git objects. (Nicolas Dandrimont) * Ignore agent= capability if sent by client. (Jelmer Vernooij) * Don't break when encountering block devices. (Jelmer Vernooij) * Decode URL paths in HttpGitClient using utf-8 rather than file system encoding. (Manuel Jacob) * Fix pushing from a shallow clone. (Brecht Machiels, #705) 0.19.16 2020-04-17 * Don't send "deepen None" to server if graph walker supports shallow. (Jelmer Vernooij, #747) * Support tweaking the compression level for loose objects through the "core.looseCompression" and "core.compression" settings. (Jelmer Vernooij) * Support tweaking the compression level for pack objects through the "core.packCompression" and "core.compression" settings. (Jelmer Vernooij) * Add a "dulwich.contrib.diffstat" module. (Kevin Hendricks) 0.19.15 2020-01-26 * Properly handle files that are just executable for the current user. (Jelmer Vernooij, #734) * Fix handling of stored encoding in ``dulwich.porcelain.get_object_by_path`` on Python 3. (Jelmer Vernooij) * Support the include_trees and rename_detector arguments at the same time when diffing trees. (Jelmer Vernooij) 0.19.14 2019-11-30 * Strip superfluous <> around email. (monnerat) * Stop checking for ref validity client-side. Users can still call check_wants manually. (Jelmer Vernooij) * Switch over to Google-style docstrings. (Jelmer Vernooij) * Add a ``dulwich.porcelain.active_branch`` function. (Jelmer Vernooij) * Cleanup new directory if clone fails. (Jelmer Vernooij, #733) * Expand "~" in global exclude path. (Jelmer Vernooij) 0.19.13 2019-08-19 BUG FIXES * Avoid ``PermissionError``, since it is Python3-specific. (Jelmer Vernooij) * Fix regression that added a dependency on C git for the test suite. (Jelmer Vernooij, #720) * Fix compatibility with Python 3.8 - mostly deprecation warnings. (Jelmer Vernooij) 0.19.12 2019-08-13 BUG FIXES * Update directory detection for `get_unstaged_changes` for Python 3. (Boris Feld, #684) * Add a basic ``porcelain.clean``. (Lane Barlow, #398) * Fix output format of ``porcelain.diff`` to match that of C Git. (Boris Feld) * Return a 404 not found error when repository is not found. * Mark ``.git`` directories as hidden on Windows. (Martin Packman, #585) * Implement ``RefsContainer.__iter__`` (Jelmer Vernooij, #717) * Don't trust modes if they can't be modified after a file has been created. (Jelmer Vernooij, #719) 0.19.11 2019-02-07 IMPROVEMENTS * Use fullname from gecos field, if available. (Jelmer Vernooij) * Support ``GIT_AUTHOR_NAME`` / ``GIT_AUTHOR_EMAIL``. (Jelmer Vernooij) * Add support for short ids in parse_commit. (Jelmer Vernooij) * Add support for ``prune`` and ``prune_tags`` arguments to ``porcelain.fetch``. (Jelmer Vernooij, #681) BUG FIXES * Fix handling of race conditions when new packs appear. 
(Jelmer Vernooij) 0.19.10 2018-01-15 IMPROVEMENTS * Add `dulwich.porcelain.write_tree`. (Jelmer Vernooij) * Support reading ``MERGE_HEADS`` in ``Repo.do_commit``. (Jelmer Vernooij) * Import from ``collections.abc`` rather than ``collections`` where applicable. Required for 3.8 compatibility. (Jelmer Vernooij) * Support plain strings as refspec arguments to ``dulwich.porcelain.push``. (Jelmer Vernooij) * Add support for creating signed tags. (Jelmer Vernooij, #542) BUG FIXES * Handle invalid ref that pretends to be a sub-folder under a valid ref. (KS Chan) 0.19.9 2018-11-17 BUG FIXES * Avoid fetching ghosts in ``Repo.fetch``. (Jelmer Vernooij) * Preserve port and username in parsed HTTP URLs. (Jelmer Vernooij) * Add basic server side implementation of ``git-upload-archive``. (Jelmer Vernooij) 0.19.8 2018-11-06 * Fix encoding when reading README file in setup.py. (egor , #668) 0.19.7 2018-11-05 CHANGES * Drop support for Python 3 < 3.4. This is because pkg_resources (which get used by setuptools and mock) no longer supports 3.3 and earlier. (Jelmer Vernooij) IMPROVEMENTS * Support ``depth`` argument to ``GitClient.fetch_pack`` and support fetching and updating shallow metadata. (Jelmer Vernooij, #240) BUG FIXES * Don't write to stdout and stderr when they are not available (such as is the case for pythonw). (Sylvia van Os, #652) * Fix compatibility with newer versions of git, which expect CONTENT_LENGTH to be set to 0 for empty body requests. (Jelmer Vernooij, #657) * Raise an exception client-side when a caller tries to request SHAs that are not directly referenced the servers' refs. (Jelmer Vernooij) * Raise more informative errors when unable to connect to repository over SSH or subprocess. (Jelmer Vernooij) * Handle commit identity fields with multiple ">" characters. (Nicolas Dandrimont) IMPROVEMENTS * ``dulwich.porcelain.get_object_by_path`` method for easily accessing a path in another tree. (Jelmer Vernooij) * Support the ``i18n.commitEncoding`` setting in config. (Jelmer Vernooij) 0.19.6 2018-08-11 BUG FIXES * Fix support for custom transport arguments in ``dulwich.porcelain.clone``. (Semyon Slepov) * Fix compatibility with Python 3.8 (Jelmer Vernooij, Daniel M. Capella) * Fix some corner cases in ``path_to_tree_path``. (Romain Keramitas) * Support paths as bytestrings in various places in ``dulwich.index`` (Jelmer Vernooij) * Avoid setup.cfg for now, since it seems to break pypi metadata. (Jelmer Vernooij, #658) 0.19.5 2018-07-08 IMPROVEMENTS * Add ``porcelain.describe``. (Sylvia van Os) BUG FIXES * Fix regression in ``dulwich.porcelain.clone`` that prevented cloning of remote repositories. (Jelmer Vernooij, #639) * Don't leave around empty parent directories for removed refs. (Damien Tournoud, #640) 0.19.4 2018-06-24 IMPROVEMENTS * Add ``porcelain.ls_files``. (Jelmer Vernooij) * Add ``Index.items``. (Jelmer Vernooij) BUG FIXES * Avoid unicode characters (e.g. the digraph ij in my surname) in setup.cfg, since setuptools doesn't deal well with them. See https://github.com/pypa/setuptools/issues/1062. (Jelmer Vernooij, #637) 0.19.3 2018-06-17 IMPROVEMENTS * Add really basic `dulwich.porcelain.fsck` implementation. (Jelmer Vernooij) * When the `DULWICH_PDB` environment variable is set, make SIGQUIT open pdb in the 'dulwich' command. * Add `checkout` argument to `Repo.clone`. (Jelmer Vernooij, #503) * Add `Repo.get_shallow` method. (Jelmer Vernooij) * Add basic `dulwich.stash` module. (Jelmer Vernooij) * Support a `prefix` argument to `dulwich.archive.tar_stream`. 
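  A rough sketch of using that argument (the surrounding call shape is an assumption; only ``tar_stream`` and ``prefix`` are taken from the entry above)::

      from dulwich.archive import tar_stream
      from dulwich.repo import Repo

      repo = Repo(".")
      tree = repo[repo[repo.head()].tree]
      with open("snapshot.tar", "wb") as f:
          # ``prefix`` places every archived path under a leading directory.
          for chunk in tar_stream(repo.object_store, tree, 0, prefix=b"snapshot/"):
              f.write(chunk)
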
(Jelmer Vernooij) BUG FIXES * Fix handling of encoding for tags. (Jelmer Vernooij, #608) * Fix tutorial tests on Python 3. (Jelmer Vernooij, #573) * Fix remote refs created by `porcelain.fetch`. (Daniel Andersson, #623) * More robust pack creation on Windows. (Daniel Andersson) * Fix recursive option for `porcelain.ls_tree`. (Romain Keramitas) TESTS * Some improvements to paramiko tests. (Filipp Frizzy) 0.19.2 2018-04-07 BUG FIXES * Fix deprecated Index.iterblobs method. (Jelmer Vernooij) 0.19.1 2018-04-05 IMPROVEMENTS * Add 'dulwich.mailmap' file for reading mailmap files. (Jelmer Vernooij) * Dulwich no longer depends on urllib3[secure]. Instead, "dulwich[https]" can be used to pull in the necessary dependencies for HTTPS support. (Jelmer Vernooij, #616) * Support the `http.sslVerify` and `http.sslCAInfo` configuration options. (Jelmer Vernooij) * Factor out `dulwich.client.parse_rsync_url` function. (Jelmer Vernooij) * Fix repeat HTTP requests using the same smart HTTP client. (Jelmer Vernooij) * New 'client.PLinkSSHVendor' for creating connections using PuTTY's plink.exe. (Adam Bradley, Filipp Frizzy) * Only pass in `key_filename` and `password` to SSHVendor implementations if those parameters are set. (This helps with older SSHVendor implementations) (Jelmer Vernooij) API CHANGES * Index.iterblobs has been renamed to Index.iterobjects. (Jelmer Vernooij) 0.19.0 2018-03-10 BUG FIXES * Make `dulwich.archive` set the gzip header file modification time so that archives created from the same Git tree are always identical. (#577, Jonas Haag) * Allow comment characters (#, ;) within configuration file strings (Daniel Andersson, #579) * Raise exception when passing in invalid author/committer values to Repo.do_commit(). (Jelmer Vernooij, #602) IMPROVEMENTS * Add a fastimport ``extra``. (Jelmer Vernooij) * Start writing reflog entries. (Jelmer Vernooij) * Add ability to use password and keyfile ssh options with SSHVendor. (Filipp Kucheryavy) * Add ``change_type_same`` flag to ``tree_changes``. (Jelmer Vernooij) API CHANGES * ``GitClient.send_pack`` now accepts a ``generate_pack_data`` rather than a ``generate_pack_contents`` function for performance reasons. (Jelmer Vernooij) * Dulwich now uses urllib3 internally for HTTP requests. The `opener` argument to `dulwich.client.HttpGitClient` that took a `urllib2` opener instance has been replaced by a `pool_manager` argument that takes a `urllib3` pool manager instance. (Daniel Andersson) 0.18.6 2017-11-11 BUG FIXES * Fix handling of empty repositories in ``porcelain.clone``. (#570, Jelmer Vernooij) * Raise an error when attempting to add paths that are not under the repository. (Jelmer Vernooij) * Fix error message for missing trailing ]. (Daniel Andersson) * Raise EmptyFileException when corruption (in the form of an empty file) is detected. (Antoine R. Dumont, #582) IMPROVEMENTS * Enforce date field parsing consistency. This also add checks on those date fields for potential overflow. (Antoine R. Dumont, #567) 0.18.5 2017-10-29 BUG FIXES * Fix cwd for hooks. (Fabian Grünbichler) * Fix setting of origin in config when non-standard origin is passed into ``Repo.clone``. (Kenneth Lareau, #565) * Prevent setting SSH arguments from SSH URLs when using SSH through a subprocess. Note that Dulwich doesn't support cloning submodules. (CVE-2017-16228) (Jelmer Vernooij) IMPROVEMENTS * Silently ignored directories in ``Repo.stage``. (Jelmer Vernooij, #564) API CHANGES * GitFile now raises ``FileLocked`` when encountering a lock rather than OSError(EEXIST). 
(Jelmer Vernooij) 0.18.4 2017-10-01 BUG FIXES * Make default User-Agent start with "git/" because GitHub won't response to HTTP smart server requests otherwise (and reply with a 404). (Jelmer vernooij, #562) 0.18.3 2017-09-03 BUG FIXES * Read config during porcelain operations that involve remotes. (Jelmer Vernooij, #545) * Fix headers of empty chunks in unified diffs. (Taras Postument, #543) * Properly follow redirects over HTTP. (Jelmer Vernooij, #117) IMPROVEMENTS * Add ``dulwich.porcelain.update_head``. (Jelmer Vernooij, #439) * ``GitClient.fetch_pack`` now returns symrefs. (Jelmer Vernooij, #485) * The server now supports providing symrefs. (Jelmer Vernooij, #485) * Add ``dulwich.object_store.commit_tree_changes`` to incrementally commit changes to a tree structure. (Jelmer Vernooij) * Add basic ``PackBasedObjectStore.repack`` method. (Jelmer Vernooij, Earl Chew, #296, #549, #552) 0.18.2 2017-08-01 TEST FIXES * Use constant timestamp so tests pass in all timezones, not just BST. (Jelmer Vernooij) 0.18.1 2017-07-31 BUG FIXES * Fix syntax error in dulwich.contrib.test_swift_smoke. (Jelmer Vernooij) 0.18.0 2017-07-31 BUG FIXES * Fix remaining tests on Windows. (Jelmer Vernooij, #493) * Fix build of C extensions with Python 3 on Windows. (Jelmer Vernooij) * Pass 'mkdir' argument onto Repo.init_bare in Repo.clone. (Jelmer Vernooij, #504) * In ``dulwich.porcelain.add``, if no files are specified, add from current working directory rather than repository root. (Jelmer Vernooij, #521) * Properly deal with submodules in 'porcelain.status'. (Jelmer Vernooij, #517) * ``dulwich.porcelain.remove`` now actually removes files from disk, not just from the index. (Jelmer Vernooij, #488) * Fix handling of "reset" command with markers and without "from". (Antoine Pietri) * Fix handling of "merge" command with markers. (Antoine Pietri) * Support treeish argument to porcelain.reset(), rather than requiring a ref/commit id. (Jelmer Vernooij) * Handle race condition when mtime doesn't change between writes/reads. (Jelmer Vernooij, #541) * Fix ``dulwich.porcelain.show`` on commits with Python 3. (Jelmer Vernooij, #532) IMPROVEMENTS * Add basic support for reading ignore files in ``dulwich.ignore``. ``dulwich.porcelain.add`` and ``dulwich.porcelain.status`` now honor ignores. (Jelmer Vernooij, Segev Finer, #524, #526) * New ``dulwich.porcelain.check_ignore`` command. (Jelmer Vernooij) * ``dulwich.porcelain.status`` now supports a ``ignored`` argument. (Jelmer Vernooij) DOCUMENTATION * Clarified docstrings for Client.{send_pack,fetch_pack} implementations. (Jelmer Vernooij, #523) 0.17.3 2017-03-20 PLATFORM SUPPORT * List Python 3.3 as supported. (Jelmer Vernooij, #513) BUG FIXES * Fix compatibility with pypy 3. (Jelmer Vernooij) 0.17.2 2017-03-19 BUG FIXES * Add workaround for https://bitbucket.org/pypy/pypy/issues/2499/cpyext-pystring_asstring-doesnt-work, fixing Dulwich when used with C extensions on pypy < 5.6. (Victor Stinner) * Properly quote config values with a '#' character in them. (Jelmer Vernooij, #511) 0.17.1 2017-03-01 IMPROVEMENTS * Add basic 'dulwich pull' command. (Jelmer Vernooij) BUG FIXES * Cope with existing submodules during pull. (Jelmer Vernooij, #505) 0.17.0 2017-03-01 TEST FIXES * Skip test that requires sync to synchronize filesystems if os.sync is not available. (Koen Martens) IMPROVEMENTS * Implement MemoryRepo.{set_description,get_description}. (Jelmer Vernooij) * Raise exception in Repo.stage() when absolute paths are passed in. 
Allow passing in relative paths to porcelain.add().(Jelmer Vernooij) BUG FIXES * Handle multi-line quoted values in config files. (Jelmer Vernooij, #495) * Allow porcelain.clone of repository without HEAD. (Jelmer Vernooij, #501) * Support passing tag ids to Walker()'s include argument. (Jelmer Vernooij) * Don't strip trailing newlines from extra headers. (Nicolas Dandrimont) * Set bufsize=0 for subprocess interaction with SSH client. Fixes hangs on Python 3. (René Stern, #434) * Don't drop first slash for SSH paths, except for those starting with "~". (Jelmer Vernooij, René Stern, #463) * Properly log off after retrieving just refs. (Jelmer Vernooij) 0.16.3 2016-01-14 TEST FIXES * Remove racy check that relies on clock time changing between writes. (Jelmer Vernooij) IMPROVEMENTS * Add porcelain.remote_add. (Jelmer Vernooij) 0.16.2 2016-01-14 IMPROVEMENTS * Fixed failing test-cases on windows. (Koen Martens) API CHANGES * Repo is now a context manager, so that it can be easily closed using a ``with`` statement. (Søren Løvborg) IMPROVEMENTS * Add naive annotate implementation in ``dulwich.annotate``. It works, but performance needs work. (Jelmer Vernooij) TEST FIXES * Only run worktree list compat tests against git 2.7.0, when 'git worktree list' was introduced. (Jelmer Vernooij) BUG FIXES * Ignore filemode when building index when core.filemode is false. (Koen Martens) * Initialize core.filemode configuration setting by probing the filesystem for trustable permissions. (Koen Martens) * Fix ``porcelain.reset`` to respect the committish argument. (Koen Martens) * Fix dulwich.porcelain.ls_remote() on Python 3. (#471, Jelmer Vernooij) * Allow both unicode and byte strings for host paths in dulwich.client. (#435, Jelmer Vernooij) * Add remote from porcelain.clone. (#466, Jelmer Vernooij) * Fix unquoting of credentials before passing to urllib2. (#475, Volodymyr Holovko) * Cope with submodules in `build_index_from_tree`. (#477, Jelmer Vernooij) * Handle deleted files in `get_unstaged_changes`. (#483, Doug Hellmann) * Don't overwrite files when they haven't changed in `build_file_from_blob`. (#479, Benoît HERVIER) * Check for existence of index file before opening pack. Fixes a race when new packs are being added. (#482, wme) 0.16.1 2016-12-25 BUG FIXES * Fix python3 compatibility for dulwich.contrib.release_robot. (Jelmer Vernooij) 0.16.0 2016-12-24 IMPROVEMENTS * Add support for worktrees. See `git-worktree(1)` and `gitrepository-layout(5)`. (Laurent Rineau) * Add support for `commondir` file in Git control directories. (Laurent Rineau) * Add support for passwords in HTTP URLs. (Jon Bain, Mika Mäenpää) * Add `release_robot` script to contrib, allowing easy finding of current version based on Git tags. (Mark Mikofski) * Add ``Blob.splitlines`` method. (Jelmer Vernooij) BUG FIXES * Fix handling of ``Commit.tree`` being set to an actual tree object rather than a tree id. (Jelmer Vernooij) * Return remote refs from LocalGitClient.fetch_pack(), consistent with the documentation for that method. (#461, Jelmer Vernooij) * Fix handling of unknown URL schemes in get_transport_and_path. (#465, Jelmer Vernooij) 0.15.0 2016-10-09 BUG FIXES * Allow missing trailing LF when reading service name from HTTP servers. (Jelmer Vernooij, Andrew Shadura, #442) * Fix dulwich.porcelain.pull() on Python3. (Jelmer Vernooij, #451) * Properly pull in tags during dulwich.porcelain.clone. 
(Jelmer Vernooij, #408) CHANGES * Changed license from "GNU General Public License, version 2.0 or later" to "Apache License, version 2.0 or later or GNU General Public License, version 2.0 or later". (#153) IMPROVEMENTS * Add ``dulwich.porcelain.ls_tree`` implementation. (Jelmer Vernooij) 0.14.1 2016-07-05 BUG FIXES * Fix regression removing untouched refs when pushing over SSH. (Jelmer Vernooij #441) * Skip Python3 tests for SWIFT contrib module, as it has not yet been ported. 0.14.0 2016-07-03 BUG FIXES * Fix ShaFile.id after modification of a copied ShaFile. (Félix Mattrat, Jelmer Vernooij) * Support removing refs from porcelain.push. (Jelmer Vernooij, #437) * Stop magic protocol ref `capabilities^{}` from leaking out to clients. (Jelmer Vernooij, #254) IMPROVEMENTS * Add `dulwich.config.parse_submodules` function. * Add `RefsContainer.follow` method. (#438) 0.13.0 2016-04-24 IMPROVEMENTS * Support `ssh://` URLs in get_transport_and_path_from_url(). (Jelmer Vernooij, #402) * Support missing empty line after headers in Git commits and tags. (Nicolas Dandrimont, #413) * Fix `dulwich.porcelain.status` when used in empty trees. (Jelmer Vernooij, #415) * Return copies of objects in MemoryObjectStore rather than references, making the behaviour more consistent with that of DiskObjectStore. (Félix Mattrat, Jelmer Vernooij) * Fix ``dulwich.web`` on Python3. (#295, Jonas Haag) CHANGES * Drop support for Python 2.6. * Fix python3 client web support. (Jelmer Vernooij) BUG FIXES * Fix hang on Gzip decompression. (Jonas Haag) * Don't rely on working tell() and seek() methods on wsgi.input. (Jonas Haag) * Support fastexport/fastimport functionality on python3 with newer versions of fastimport (>= 0.9.5). (Jelmer Vernooij, Félix Mattrat) 0.12.0 2015-12-13 IMPROVEMENTS * Add a `dulwich.archive` module that can create tarballs. Based on code from Jonas Haag in klaus. * Add a `dulwich.reflog` module for reading and writing reflogs. (Jelmer Vernooij) * Fix handling of ambiguous refs in `parse_ref` to make it match the behaviour described in https://git-scm.com/docs/gitrevisions. (Chris Bunney) * Support Python3 in C modules. (Lele Gaifax) BUG FIXES * Simplify handling of SSH command invocation. Fixes quoting of paths. Thanks, Thomas Liebetraut. (#384) * Fix inconsistent handling of trailing slashes for DictRefsContainer. (#383) * Add hack to support thin packs duing fetch(), albeit while requiring the entire pack file to be loaded into memory. (jsbain) CHANGES * This will be the last release to support Python 2.6. 0.11.2 2015-09-18 IMPROVEMENTS * Add support for agent= capability. (Jelmer Vernooij, #298) * Add support for quiet capability. (Jelmer Vernooij) CHANGES * The ParamikoSSHVendor class has been moved to * dulwich.contrib.paramiko_vendor, as it's currently untested. (Jelmer Vernooij, #364) 0.11.1 2015-09-13 Fix-up release to exclude broken blame.py file. 0.11.0 2015-09-13 IMPROVEMENTS * Extended Python3 support to most of the codebase. (Gary van der Merwe, Jelmer Vernooij) * The `Repo` object has a new `close` method that can be called to close any open resources. (Gary van der Merwe) * Support 'git.bat' in SubprocessGitClient on Windows. (Stefan Zimmermann) * Advertise 'ofs-delta' capability in receive-pack server side capabilities. (Jelmer Vernooij) * Switched `default_local_git_client_cls` to `LocalGitClient`. (Gary van der Merwe) * Add `porcelain.ls_remote` and `GitClient.get_refs`. (Michael Edgar) * Add `Repo.discover` method. (B. M. Corser) * Add `dulwich.objectspec.parse_refspec`. 
(Jelmer Vernooij) * Add `porcelain.pack_objects` and `porcelain.repack`. (Jelmer Vernooij) BUG FIXES * Fix handling of 'done' in graph walker and implement the 'no-done' capability. (Tommy Yu, #88) * Avoid recursion limit issues resolving deltas. (William Grant, #81) * Allow arguments in local client binary path overrides. (Jelmer Vernooij) * Fix handling of commands with arguments in paramiko SSH client. (Andreas Klöckner, Jelmer Vernooij, #363) * Fix parsing of quoted strings in configs. (Jelmer Vernooij, #305) 0.10.1 2015-03-25 BUG FIXES * Return `ApplyDeltaError` when encountering delta errors in both C extensions and native delta application code. (Jelmer Vernooij, #259) 0.10.0 2015-03-22 BUG FIXES * In dulwich.index.build_index_from_tree, by default refuse to create entries that start with .git/. (Jelmer Vernooij, CVE-2014-9706) * Fix running of testsuite when installed. (Jelmer Vernooij, #223) * Use a block cache in _find_content_rename_candidates(), improving performance. (Mike Williams) * Add support for ``core.protectNTFS`` setting. (Jelmer Vernooij) * Fix TypeError when fetching empty updates. (Hwee Miin Koh) * Resolve delta refs when pulling into a MemoryRepo. (Max Shawabkeh, #256) * Fix handling of tags of non-commits in missing object finder. (Augie Fackler, #211) * Explicitly disable mmap on plan9 where it doesn't work. (Jeff Sickel) IMPROVEMENTS * New public method `Repo.reset_index`. (Jelmer Vernooij) * Prevent duplicate parsing of loose files in objects directory when reading. Thanks to David Keijser for the report. (Jelmer Vernooij, #231) 0.9.9 2015-03-20 SECURITY BUG FIXES * Fix buffer overflow in C implementation of pack apply_delta(). (CVE-2015-0838) Thanks to Ivan Fratric of the Google Security Team for reporting this issue. (Jelmer Vernooij) 0.9.8 2014-11-30 BUG FIXES * Various fixes to improve test suite running on Windows. (Gary van der Merwe) * Limit delta copy length to 64K in v2 pack files. (Robert Brown) * Strip newline from final ACKed SHA while fetching packs. (Michael Edgar) * Remove assignment to PyList_SIZE() that was causing segfaults on pypy. (Jelmer Vernooij, #196) IMPROVEMENTS * Add porcelain 'receive-pack' and 'upload-pack'. (Jelmer Vernooij) * Handle SIGINT signals in bin/dulwich. (Jelmer Vernooij) * Add 'status' support to bin/dulwich. (Jelmer Vernooij) * Add 'branch_create', 'branch_list', 'branch_delete' porcelain. (Jelmer Vernooij) * Add 'fetch' porcelain. (Jelmer Vernooij) * Add 'tag_delete' porcelain. (Jelmer Vernooij) * Add support for serializing/deserializing 'gpgsig' attributes in Commit. (Jelmer Vernooij) CHANGES * dul-web is now available as 'dulwich web-daemon'. (Jelmer Vernooij) * dulwich.porcelain.tag has been renamed to tag_create. dulwich.porcelain.list_tags has been renamed to tag_list. (Jelmer Vernooij) API CHANGES * Restore support for Python 2.6. (Jelmer Vernooij, Gary van der Merwe) 0.9.7 2014-06-08 BUG FIXES * Fix tests dependent on hash ordering. (Michael Edgar) * Support staging symbolic links in Repo.stage. (Robert Brown) * Ensure that all files object are closed when running the test suite. (Gary van der Merwe) * When writing OFS_DELTA pack entries, write correct offset. (Augie Fackler) * Fix handler of larger copy operations in packs. (Augie Fackler) * Various fixes to improve test suite running on Windows. (Gary van der Merwe) * Fix logic for extra adds of identical files in rename detector. (Robert Brown) IMPROVEMENTS * Add porcelain 'status'. (Ryan Faulkner) * Add porcelain 'daemon'. 
(Jelmer Vernooij) * Add `dulwich.greenthreads` module which provides support for concurrency of some object store operations. (Fabien Boucher) * Various changes to improve compatibility with Python 3. (Gary van der Merwe, Hannu Valtonen, michael-k) * Add OpenStack Swift backed repository implementation in dulwich.contrib. See README.swift for details. (Fabien Boucher) API CHANGES * An optional close function can be passed to the Protocol class. This will be called by its close method. (Gary van der Merwe) * All classes with close methods are now context managers, so that they can be easily closed using a `with` statement. (Gary van der Merwe) * Remove deprecated `num_objects` argument to `write_pack` methods. (Jelmer Vernooij) OTHER CHANGES * The 'dul-daemon' script has been removed. The same functionality is now available as 'dulwich daemon'. (Jelmer Vernooij) 0.9.6 2014-04-23 IMPROVEMENTS * Add support for recursive add in 'git add'. (Ryan Faulkner, Jelmer Vernooij) * Add porcelain 'list_tags'. (Ryan Faulkner) * Add porcelain 'push'. (Ryan Faulkner) * Add porcelain 'pull'. (Ryan Faulkner) * Support 'http.proxy' in HttpGitClient. (Jelmer Vernooij, #1096030) * Support 'http.useragent' in HttpGitClient. (Jelmer Vernooij) * In server, wait for clients to send empty list of wants when talking to empty repository. (Damien Tournoud) * Various changes to improve compatibility with Python 3. (Gary van der Merwe) BUG FIXES * Support unseekable 'wsgi.input' streams. (Jonas Haag) * Raise TypeError when passing unicode() object to Repo.__getitem__. (Jonas Haag) * Fix handling of `reset` command in dulwich.fastexport. (Jelmer Vernooij, #1249029) * In client, don't wait for server to close connection first. Fixes hang when used against GitHub server implementation. (Siddharth Agarwal) * DeltaChainIterator: fix a corner case where an object is inflated as an object already in the repository. (Damien Tournoud, #135) * Stop leaking file handles during pack reload. (Damien Tournoud) * Avoid reopening packs during pack cache reload. (Jelmer Vernooij) API CHANGES * Drop support for Python 2.6. (Jelmer Vernooij) 0.9.5 2014-02-23 IMPROVEMENTS * Add porcelain 'tag'. (Ryan Faulkner) * New module `dulwich.objectspec` for parsing strings referencing objects and commit ranges. (Jelmer Vernooij) * Add shallow branch support. (milki) * Allow passing urllib2 `opener` into HttpGitClient. (Dov Feldstern, #909037) CHANGES * Drop support for Python 2.4 and 2.5. (Jelmer Vernooij) API CHANGES * Remove long deprecated ``Repo.commit``, ``Repo.get_blob``, ``Repo.tree`` and ``Repo.tag``. (Jelmer Vernooij) * Remove long deprecated ``Repo.revision_history`` and ``Repo.ref``. (Jelmer Vernooij) * Remove long deprecated ``Tree.entries``. (Jelmer Vernooij) BUG FIXES * Raise KeyError rather than TypeError when passing in unicode object of length 20 or 40 to Repo.__getitem__. (Jelmer Vernooij) * Use 'rm' rather than 'unlink' in tests, since the latter does not exist on OpenBSD and other platforms. (Dmitrij D. Czarkoff) 0.9.4 2013-11-30 IMPROVEMENTS * Add ssh_kwargs attribute to ParamikoSSHVendor. (milki) * Add Repo.set_description(). (Víðir Valberg Guðmundsson) * Add a basic `dulwich.porcelain` module. (Jelmer Vernooij, Marcin Kuzminski) * Various performance improvements for object access. (Jelmer Vernooij) * New function `get_transport_and_path_from_url`, similar to `get_transport_and_path` but only supports URLs. (Jelmer Vernooij) * Add support for file:// URLs in `get_transport_and_path_from_url`. 
(Jelmer Vernooij) * Add LocalGitClient implementation. (Jelmer Vernooij) BUG FIXES * Support filesystems with 64bit inode and device numbers. (André Roth) CHANGES * Ref handling has been moved to dulwich.refs. (Jelmer Vernooij) API CHANGES * Remove long deprecated RefsContainer.set_ref(). (Jelmer Vernooij) * Repo.ref() is now deprecated in favour of Repo.refs[]. (Jelmer Vernooij) FEATURES * Add support for graftpoints. (milki) 0.9.3 2013-09-27 BUG FIXES * Fix path for stdint.h in MANIFEST.in. (Jelmer Vernooij) 0.9.2 2013-09-26 BUG FIXES * Include stdint.h in MANIFEST.in (Mark Mikofski) 0.9.1 2013-09-22 BUG FIXES * Support lookups of 40-character refs in BaseRepo.__getitem__. (Chow Loong Jin, Jelmer Vernooij) * Fix fetching packs with side-band-64k capability disabled. (David Keijser, Jelmer Vernooij) * Several fixes in send-pack protocol behaviour - handling of empty pack files and deletes. (milki, #1063087) * Fix capability negotiation when fetching packs over HTTP. (#1072461, William Grant) * Enforce determine_wants returning an empty list rather than None. (Fabien Boucher, Jelmer Vernooij) * In the server, support pushes just removing refs. (Fabien Boucher, Jelmer Vernooij) IMPROVEMENTS * Support passing a single revision to BaseRepo.get_walker() rather than a list of revisions.g (Alberto Ruiz) * Add `Repo.get_description` method. (Jelmer Vernooij) * Support thin packs in Pack.iterobjects() and Pack.get_raw(). (William Grant) * Add `MemoryObjectStore.add_pack` and `MemoryObjectStore.add_thin_pack` methods. (David Bennett) * Add paramiko-based SSH vendor. (Aaron O'Mullan) * Support running 'dulwich.server' and 'dulwich.web' using 'python -m'. (Jelmer Vernooij) * Add ObjectStore.close(). (Jelmer Vernooij) * Raise appropriate NotImplementedError when encountering dumb HTTP servers. (Jelmer Vernooij) API CHANGES * SSHVendor.connect_ssh has been renamed to SSHVendor.run_command. (Jelmer Vernooij) * ObjectStore.add_pack() now returns a 3-tuple. The last element will be an abort() method that can be used to cancel the pack operation. (Jelmer Vernooij) 0.9.0 2013-05-31 BUG FIXES * Push efficiency - report missing objects only. (#562676, Artem Tikhomirov) * Use indentation consistent with C Git in config files. (#1031356, Curt Moore, Jelmer Vernooij) * Recognize and skip binary files in diff function. (Takeshi Kanemoto) * Fix handling of relative paths in dulwich.client.get_transport_and_path. (Brian Visel, #1169368) * Preserve ordering of entries in configuration. (Benjamin Pollack) * Support ~ expansion in SSH client paths. (milki, #1083439) * Support relative paths in alternate paths. (milki, Michel Lespinasse, #1175007) * Log all error messages from wsgiref server to the logging module. This makes the test suit quiet again. (Gary van der Merwe) * Support passing None for empty tree in changes_from_tree. (Kevin Watters) * Support fetching empty repository in client. (milki, #1060462) IMPROVEMENTS: * Add optional honor_filemode flag to build_index_from_tree. (Mark Mikofski) * Support core/filemode setting when building trees. (Jelmer Vernooij) * Add chapter on tags in tutorial. (Ryan Faulkner) FEATURES * Add support for mergetags. (milki, #963525) * Add support for posix shell hooks. (milki) 0.8.7 2012-11-27 BUG FIXES * Fix use of alternates in ``DiskObjectStore``.{__contains__,__iter__}. (Dmitriy) * Fix compatibility with Python 2.4. (David Carr) 0.8.6 2012-11-09 API CHANGES * dulwich.__init__ no longer imports client, protocol, repo and server modules. 
(Jelmer Vernooij) FEATURES * ConfigDict now behaves more like a dictionary. (Adam 'Cezar' Jenkins, issue #58) * HTTPGitApplication now takes an optional `fallback_app` argument. (Jonas Haag, issue #67) * Support for large pack index files. (Jameson Nash) TESTING * Make index entry tests a little bit less strict, to cope with slightly different behaviour on various platforms. (Jelmer Vernooij) * ``setup.py test`` (available when setuptools is installed) now runs all tests, not just the basic unit tests. (Jelmer Vernooij) BUG FIXES * Commit._deserialize now actually deserializes the current state rather than the previous one. (Yifan Zhang, issue #59) * Handle None elements in lists of TreeChange objects. (Alex Holmes) * Support cloning repositories without HEAD set. (D-Key, Jelmer Vernooij, issue #69) * Support ``MemoryRepo.get_config``. (Jelmer Vernooij) * In ``get_transport_and_path``, pass extra keyword arguments on to HttpGitClient. (Jelmer Vernooij) 0.8.5 2012-03-29 BUG FIXES * Avoid use of 'with' in dulwich.index. (Jelmer Vernooij) * Be a little bit strict about OS behaviour in index tests. Should fix the tests on Debian GNU/kFreeBSD. (Jelmer Vernooij) 0.8.4 2012-03-28 BUG FIXES * Options on the same line as sections in config files are now supported. (Jelmer Vernooij, #920553) * Only negotiate capabilities that are also supported by the server. (Rod Cloutier, Risto Kankkunen) * Fix parsing of invalid timezone offsets with two minus signs. (Jason R. Coombs, #697828) * Reset environment variables during tests, to avoid test isolation leaks reading ~/.gitconfig. (Risto Kankkunen) TESTS * $HOME is now explicitly specified for tests that use it to read ``~/.gitconfig``, to prevent test isolation issues. (Jelmer Vernooij, #920330) FEATURES * Additional arguments to get_transport_and_path are now passed on to the constructor of the transport. (Sam Vilain) * The WSGI server now transparently handles when a git client submits data using Content-Encoding: gzip. (David Blewett, Jelmer Vernooij) * Add dulwich.index.build_index_from_tree(). (milki) 0.8.3 2012-01-21 FEATURES * The config parser now supports the git-config file format as described in git-config(1) and can write git config files. (Jelmer Vernooij, #531092, #768687) * ``Repo.do_commit`` will now use the user identity from .git/config or ~/.gitconfig if none was explicitly specified. (Jelmer Vernooij) BUG FIXES * Allow ``determine_wants`` methods to include the zero sha in their return value. (Jelmer Vernooij) 0.8.2 2011-12-18 BUG FIXES * Cope with different zlib buffer sizes in sha1 file parser. (Jelmer Vernooij) * Fix get_transport_and_path for HTTP/HTTPS URLs. (Bruno Renié) * Avoid calling free_objects() on NULL in error cases. (Chris Eberle) * Fix use --bare argument to 'dulwich init'. (Chris Eberle) * Properly abort connections when the determine_wants function raises an exception. (Jelmer Vernooij, #856769) * Tweak xcodebuild hack to deal with more error output. (Jelmer Vernooij, #903840) FEATURES * Add support for retrieving tarballs from remote servers. (Jelmer Vernooij, #379087) * New method ``update_server_info`` which generates data for dumb server access. (Jelmer Vernooij, #731235) 0.8.1 2011-10-31 FEATURES * Repo.do_commit has a new argument 'ref'. * Repo.do_commit has a new argument 'merge_heads'. (Jelmer Vernooij) * New ``Repo.get_walker`` method. (Jelmer Vernooij) * New ``Repo.clone`` method. (Jelmer Vernooij, #725369) * ``GitClient.send_pack`` now supports the 'side-band-64k' capability. 
(Jelmer Vernooij) * ``HttpGitClient`` which supports the smart server protocol over HTTP. "dumb" access is not yet supported. (Jelmer Vernooij, #373688) * Add basic support for alternates. (Jelmer Vernooij, #810429) CHANGES * unittest2 or python >= 2.7 is now required for the testsuite. testtools is no longer supported. (Jelmer Vernooij, #830713) BUG FIXES * Fix compilation with older versions of MSVC. (Martin gz) * Special case 'refs/stash' as a valid ref. (Jelmer Vernooij, #695577) * Smart protocol clients can now change refs even if they are not uploading new data. (Jelmer Vernooij, #855993) * Don't compile C extensions when running in pypy. (Ronny Pfannschmidt, #881546) * Use different name for strnlen replacement function to avoid clashing with system strnlen. (Jelmer Vernooij, #880362) API CHANGES * ``Repo.revision_history`` is now deprecated in favor of ``Repo.get_walker``. (Jelmer Vernooij) 0.8.0 2011-08-07 FEATURES * New DeltaChainIterator abstract class for quickly iterating all objects in a pack, with implementations for pack indexing and inflation. (Dave Borowitz) * New walk module with a Walker class for customizable commit walking. (Dave Borowitz) * New tree_changes_for_merge function in diff_tree. (Dave Borowitz) * Easy rename detection in RenameDetector even without find_copies_harder. (Dave Borowitz) BUG FIXES * Avoid storing all objects in memory when writing pack. (Jelmer Vernooij, #813268) * Support IPv6 for git:// connections. (Jelmer Vernooij, #801543) * Improve performance of Repo.revision_history(). (Timo Schmid, #535118) * Fix use of SubprocessWrapper on Windows. (Paulo Madeira, #670035) * Fix compilation on newer versions of Mac OS X (Lion and up). (Ryan McKern, #794543) * Prevent raising ValueError for correct refs in RefContainer.__delitem__. * Correctly return a tuple from MemoryObjectStore.get_raw. (Dave Borowitz) * Fix a bug in reading the pack checksum when there are fewer than 20 bytes left in the buffer. (Dave Borowitz) * Support ~ in git:// URL paths. (Jelmer Vernooij, #813555) * Make ShaFile.__eq__ work when other is not a ShaFile. (Dave Borowitz) * ObjectStore.get_graph_walker() now no longer yields the same revision more than once. This has a significant improvement for performance when wide revision graphs are involved. (Jelmer Vernooij, #818168) * Teach ReceivePackHandler how to read empty packs. (Dave Borowitz) * Don't send a pack with duplicates of the same object. (Dave Borowitz) * Teach the server how to serve a clone of an empty repo. (Dave Borowitz) * Correctly advertise capabilities during receive-pack. (Dave Borowitz) * Fix add/add and add/rename conflicts in tree_changes_for_merge. (Dave Borowitz) * Use correct MIME types in web server. (Dave Borowitz) API CHANGES * write_pack no longer takes the num_objects argument and requires an object to be passed in that is iterable (rather than an iterator) and that provides __len__. (Jelmer Vernooij) * write_pack_data has been renamed to write_pack_objects and no longer takes a num_objects argument. (Jelmer Vernooij) * take_msb_bytes, read_zlib_chunks, unpack_objects, and PackStreamReader.read_objects now take an additional argument indicating a crc32 to compute. (Dave Borowitz) * PackObjectIterator was removed; its functionality is still exposed by PackData.iterobjects. (Dave Borowitz) * Add a sha arg to write_pack_object to incrementally compute a SHA. (Dave Borowitz) * Include offset in PackStreamReader results. (Dave Borowitz) * Move PackStreamReader from server to pack. 
(Dave Borowitz) * Extract a check_length_and_checksum, compute_file_sha, and pack_object_header pack helper functions. (Dave Borowitz) * Extract a compute_file_sha function. (Dave Borowitz) * Remove move_in_thin_pack as a separate method; add_thin_pack now completes the thin pack and moves it in in one step. Remove ThinPackData as well. (Dave Borowitz) * Custom buffer size in read_zlib_chunks. (Dave Borowitz) * New UnpackedObject data class that replaces ad-hoc tuples in the return value of unpack_object and various DeltaChainIterator methods. (Dave Borowitz) * Add a lookup_path convenience method to Tree. (Dave Borowitz) * Optionally create RenameDetectors without passing in tree SHAs. (Dave Borowitz) * Optionally include unchanged entries in RenameDetectors. (Dave Borowitz) * Optionally pass a RenameDetector to tree_changes. (Dave Borowitz) * Optionally pass a request object through to server handlers. (Dave Borowitz) TEST CHANGES * If setuptools is installed, "python setup.py test" will now run the testsuite. (Jelmer Vernooij) * Add a new build_pack test utility for building packs from a simple spec. (Dave Borowitz) * Add a new build_commit_graph test utility for building commits from a simple spec. (Dave Borowitz) 0.7.1 2011-04-12 BUG FIXES * Fix double decref in _diff_tree.c. (Ted Horst, #715528) * Fix the build on Windows. (Pascal Quantin) * Fix get_transport_and_path compatibility with pre-2.6.5 versions of Python. (Max Bowsher, #707438) * BaseObjectStore.determine_wants_all no longer breaks on zero SHAs. (Jelmer Vernooij) * write_tree_diff() now supports submodules. (Jelmer Vernooij) * Fix compilation for XCode 4 and older versions of distutils.sysconfig. (Daniele Sluijters) IMPROVEMENTS * Sphinxified documentation. (Lukasz Balcerzak) * Add Pack.keep.(Marc Brinkmann) API CHANGES * The order of the parameters to Tree.add(name, mode, sha) has changed, and is now consistent with the rest of Dulwich. Existing code will still work but print a DeprecationWarning. (Jelmer Vernooij, #663550) * Tree.entries() is now deprecated in favour of Tree.items() and Tree.iteritems(). (Jelmer Vernooij) 0.7.0 2011-01-21 FEATURES * New `dulwich.diff_tree` module for simple content-based rename detection. (Dave Borowitz) * Add Tree.items(). (Jelmer Vernooij) * Add eof() and unread_pkt_line() methods to Protocol. (Dave Borowitz) * Add write_tree_diff(). (Jelmer Vernooij) * Add `serve_command` function for git server commands as executables. (Jelmer Vernooij) * dulwich.client.get_transport_and_path now supports rsync-style repository URLs. (Dave Borowitz, #568493) BUG FIXES * Correct short-circuiting operation for no-op fetches in the server. (Dave Borowitz) * Support parsing git mbox patches without a version tail, as generated by Mercurial. (Jelmer Vernooij) * Fix dul-receive-pack and dul-upload-pack. (Jelmer Vernooij) * Zero-padded file modes in Tree objects no longer trigger an exception but the check code warns about them. (Augie Fackler, #581064) * Repo.init() now honors the mkdir flag. (#671159) * The ref format is now checked when setting a ref rather than when reading it back. (Dave Borowitz, #653527) * Make sure pack files are closed correctly. (Tay Ray Chuan) DOCUMENTATION * Run the tutorial inside the test suite. (Jelmer Vernooij) * Reorganized and updated the tutorial. (Jelmer Vernooij, Dave Borowitz, #610550, #610540) 0.6.2 2010-10-16 BUG FIXES * HTTP server correctly handles empty CONTENT_LENGTH. (Dave Borowitz) * Don't error when creating GitFiles with the default mode. 
(Dave Borowitz) * ThinPackData.from_file now works with resolve_ext_ref callback. (Dave Borowitz) * Provide strnlen() on mingw32 which doesn't have it. (Hans Kolek) * Set bare=true in the configuration for bare repositories. (Dirk Neumann) FEATURES * Use slots for core objects to save up on memory. (Jelmer Vernooij) * Web server supports streaming progress/pack output. (Dave Borowitz) * New public function dulwich.pack.write_pack_header. (Dave Borowitz) * Distinguish between missing files and read errors in HTTP server. (Dave Borowitz) * Initial work on support for fastimport using python-fastimport. (Jelmer Vernooij) * New dulwich.pack.MemoryPackIndex class. (Jelmer Vernooij) * Delegate SHA peeling to the object store. (Dave Borowitz) TESTS * Use GitFile when modifying packed-refs in tests. (Dave Borowitz) * New tests in test_web with better coverage and fewer ad-hoc mocks. (Dave Borowitz) * Standardize quote delimiters in test_protocol. (Dave Borowitz) * Fix use when testtools is installed. (Jelmer Vernooij) * Add trivial test for write_pack_header. (Jelmer Vernooij) * Refactor some of dulwich.tests.compat.server_utils. (Dave Borowitz) * Allow overwriting id property of objects in test utils. (Dave Borowitz) * Use real in-memory objects rather than stubs for server tests. (Dave Borowitz) * Clean up MissingObjectFinder. (Dave Borowitz) API CHANGES * ObjectStore.iter_tree_contents now walks contents in depth-first, sorted order. (Dave Borowitz) * ObjectStore.iter_tree_contents can optionally yield tree objects as well. (Dave Borowitz). * Add side-band-64k support to ReceivePackHandler. (Dave Borowitz) * Change server capabilities methods to classmethods. (Dave Borowitz) * Tweak server handler injection. (Dave Borowitz) * PackIndex1 and PackIndex2 now subclass FilePackIndex, which isg itself a subclass of PackIndex. (Jelmer Vernooij) DOCUMENTATION * Add docstrings for various functions in dulwich.objects. (Jelmer Vernooij) * Clean up docstrings in dulwich.protocol. (Dave Borowitz) * Explicitly specify allowed protocol commands to ProtocolGraphWalker.read_proto_line. (Dave Borowitz) * Add utility functions to DictRefsContainer. (Dave Borowitz) 0.6.1 2010-07-22 BUG FIXES * Fix memory leak in C implementation of sorted_tree_items. (Dave Borowitz) * Use correct path separators for named repo files. (Dave Borowitz) * python > 2.7 and testtools-based test runners will now also pick up skipped tests correctly. (Jelmer Vernooij) FEATURES * Move named file initialization to BaseRepo. (Dave Borowitz) * Add logging utilities and git/HTTP server logging. (Dave Borowitz) * The GitClient interface has been cleaned up and instances are now reusable. (Augie Fackler) * Allow overriding paths to executables in GitSSHClient.g (Ross Light, Jelmer Vernooij, #585204) * Add PackBasedObjectStore.pack_loose_objects(). (Jelmer Vernooij) TESTS * Add tests for sorted_tree_items and C implementation. (Dave Borowitz) * Add a MemoryRepo that stores everything in memory. (Dave Borowitz) * Quiet logging output from web tests. (Dave Borowitz) * More flexible version checking for compat tests. (Dave Borowitz) * Compat tests for servers with and without side-band-64k. (Dave Borowitz) CLEANUP * Clean up file headers. (Dave Borowitz) TESTS * Use GitFile when modifying packed-refs in tests. (Dave Borowitz) API CHANGES * dulwich.pack.write_pack_index_v{1,2} now take a file-like object rather than a filename. (Jelmer Vernooij) * Make dul-daemon/dul-web trivial wrappers around server functionality. 
(Dave Borowitz) * Move reference WSGI handler to web.py. (Dave Borowitz) * Factor out _report_status in ReceivePackHandler. (Dave Borowitz) * Factor out a function to convert a line to a pkt-line. (Dave Borowitz) 0.6.0 2010-05-22 note: This list is most likely incomplete for 0.6.0. BUG FIXES * Fix ReceivePackHandler to disallow removing refs without delete-refs. (Dave Borowitz) * Deal with capabilities required by the client, even if they can not be disabled in the server. (Dave Borowitz) * Fix trailing newlines in generated patch files. (Jelmer Vernooij) * Implement RefsContainer.__contains__. (Jelmer Vernooij) * Cope with \r in ref files on Windows. ( http://github.com/jelmer/dulwich/issues/#issue/13, Jelmer Vernooij) * Fix GitFile breakage on Windows. (Anatoly Techtonik, #557585) * Support packed ref deletion with no peeled refs. (Augie Fackler) * Fix send pack when there is nothing to fetch. (Augie Fackler) * Fix fetch if no progress function is specified. (Augie Fackler) * Allow double-staging of files that are deleted in the index. (Dave Borowitz) * Fix RefsContainer.add_if_new to support dangling symrefs. (Dave Borowitz) * Non-existent index files in non-bare repositories are now treated as empty. (Dave Borowitz) * Always update ShaFile.id when the contents of the object get changed. (Jelmer Vernooij) * Various Python2.4-compatibility fixes. (Dave Borowitz) * Fix thin pack handling. (Dave Borowitz) FEATURES * Add include-tag capability to server. (Dave Borowitz) * New dulwich.fastexport module that can generate fastexport streams. (Jelmer Vernooij) * Implemented BaseRepo.__contains__. (Jelmer Vernooij) * Add __setitem__ to DictRefsContainer. (Dave Borowitz) * Overall improvements checking Git objects. (Dave Borowitz) * Packs are now verified while they are received. (Dave Borowitz) TESTS * Add framework for testing compatibility with C Git. (Dave Borowitz) * Add various tests for the use of non-bare repositories. (Dave Borowitz) * Cope with diffstat not being available on all platforms. (Tay Ray Chuan, Jelmer Vernooij) * Add make_object and make_commit convenience functions to test utils. (Dave Borowitz) API BREAKAGES * The 'committer' and 'message' arguments to Repo.do_commit() have been swapped. 'committer' is now optional. (Jelmer Vernooij) * Repo.get_blob, Repo.commit, Repo.tag and Repo.tree are now deprecated. (Jelmer Vernooij) * RefsContainer.set_ref() was renamed to RefsContainer.set_symbolic_ref(), for clarity. (Jelmer Vernooij) API CHANGES * The primary serialization APIs in dulwich.objects now work with chunks of strings rather than with full-text strings. (Jelmer Vernooij) 0.5.0 2010-03-03 BUG FIXES * Support custom fields in commits (readonly). (Jelmer Vernooij) * Improved ref handling. (Dave Borowitz) * Rework server protocol to be smarter and interoperate with cgit client. (Dave Borowitz) * Add a GitFile class that uses the same locking protocol for writes as cgit. (Dave Borowitz) * Cope with forward slashes correctly in the index on Windows. (Jelmer Vernooij, #526793) FEATURES * --pure option to setup.py to allow building/installing without the C extensions. (Hal Wine, Anatoly Techtonik, Jelmer Vernooij, #434326) * Implement Repo.get_config(). (Jelmer Vernooij, Augie Fackler) * HTTP dumb and smart server. (Dave Borowitz) * Add abstract baseclass for Repo that does not require file system operations. (Dave Borowitz) 0.4.1 2010-01-03 FEATURES * Add ObjectStore.iter_tree_contents(). (Jelmer Vernooij) * Add Index.changes_from_tree().
(Jelmer Vernooij) * Add ObjectStore.tree_changes(). (Jelmer Vernooij) * Add functionality for writing patches in dulwich.patch. (Jelmer Vernooij) 0.4.0 2009-10-07 DOCUMENTATION * Added tutorial. API CHANGES * dulwich.object_store.tree_lookup_path will now return the mode andg sha of the object found rather than the object itself. BUG FIXES * Use binascii.hexlify / binascii.unhexlify for better performance. * Cope with extra unknown data in index files by ignoring it (for now). * Add proper error message when server unexpectedly hangs up. (#415843) * Correctly write opcode for equal in create_delta. 0.3.3 2009-07-23 FEATURES * Implement ShaFile.__hash__(). * Implement Tree.__len__() BUG FIXES g * Check for 'objects' and 'refs' directories when looking for a Git repository. (#380818) 0.3.2 2009-05-20 BUG FIXES * Support the encoding field in Commits. g * Some Windows compatibility fixes. * Fixed several issues in commit support. FEATURES * Basic support for handling submodules. 0.3.1 2009-05-13 FEATURES * Implemented Repo.__getitem__, Repo.__setitem__ and Repo.__delitem__ tog access content. API CHANGES * Removed Repo.set_ref, Repo.remove_ref, Repo.tags, Repo.get_refs andg Repo.heads in favor of Repo.refs, a dictionary-like object for accessing refs. BUG FIXES * Removed import of 'sha' module in objects.py, which was causingg deprecation warnings on Python 2.6. 0.3.0 2009-05-10 FEATURES * A new function 'commit_tree' has been added that can commit a treeg based on an index. BUG FIXES * The memory usage when generating indexes has been significantly reduced. g * A memory leak in the C implementation of parse_tree has been fixed. * The send-pack smart server command now works. (Thanks Scott Chacon) * The handling of short timestamps (less than 10 digits) has been fixed. * The handling of timezones has been fixed. 0.2.1 2009-04-30 BUG FIXES * Fix compatibility with Python2.4. 0.2.0 2009-04-30 FEATURES * Support for activity reporting in smart protocol client. * Optional C extensions for better performance in a couple ofg places that are performance-critical. 0.1.1 2009-03-13 BUG FIXES * Fixed regression in Repo.find_missing_objects() * Don't fetch ^{} objects from remote hosts, as requesting themg causes a hangup. * Always write pack to disk completely before calculating checksum. FEATURES * Allow disabling thin packs when talking to remote hosts. 0.1.0 2009-01-24 * Initial release. dulwich-1.0.0/README.rst000066400000000000000000000064451513301442600146460ustar00rootroot00000000000000Dulwich ======= This is the Dulwich project. It aims to provide an interface to git repos (both local and remote) that doesn't call out to git directly but instead uses pure Python. **Main website**: **License**: Apache License, version 2 or GNU General Public License, version 2 or later. SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later The project is named after the part of London that Mr. and Mrs. Git live in the particular Monty Python sketch. Differences with other Python Git libraries ------------------------------------------- Unlike other Python Git libraries, Dulwich is available as a standalone package that doesn't depend on git (like GitPython) being installed or any native code (like pygit2). This comes at the cost of speed, but makes it easier to deploy in environments where git isn't available or where it's important to have a pure Python implementation. To improve performance, Dulwich includes optional Rust bindings that can be used to speed up low-level operations. 
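Whether the optional Rust extensions are actually in use can be checked at runtime. A minimal sketch (it assumes the compiled modules are importable as ``dulwich._objects`` and friends, matching the crates shipped in this archive; if the import fails, the pure-Python code paths described above are used)::

    >>> try:
    ...     from dulwich import _objects  # optional compiled (Rust) extension, may be absent
    ...     print("Rust extensions available")
    ... except ImportError:
    ...     print("using the pure-Python implementation")

The ``--pure`` install described in the next section skips building these modules entirely.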
Installation ------------ By default, Dulwich' setup.py will attempt to build and install the optional Rust extensions. The reason for this is that they significantly improve the performance since some low-level operations that are executed often are much slower in CPython. If you don't want to install the Rust bindings, specify the --pure argument to setup.py:: $ python setup.py --pure install or if you are installing from pip:: $ pip install --no-binary dulwich dulwich --config-settings "--build-option=--pure" Note that you can also specify --build-option in a `requirements.txt `_ file, e.g. like this:: dulwich --config-settings "--build-option=--pure" Getting started --------------- Dulwich comes with both a lower-level API and higher-level plumbing ("porcelain"). For example, to use the lower level API to access the commit message of the last commit:: >>> from dulwich.repo import Repo >>> r = Repo('.') >>> r.head() '57fbe010446356833a6ad1600059d80b1e731e15' >>> c = r[r.head()] >>> c >>> c.message 'Add note about encoding.\n' And to print it using porcelain:: >>> from dulwich import porcelain >>> porcelain.log('.', max_entries=1) -------------------------------------------------- commit: 57fbe010446356833a6ad1600059d80b1e731e15 Author: Jelmer Vernooij Date: Sat Apr 29 2017 23:57:34 +0000 Add note about encoding. Further documentation --------------------- The dulwich documentation can be found in docs/ and built by running ``make doc``. It can also be found `on the web `_. Help ---- There is a *#dulwich* IRC channel on the `OFTC `_, and a `dulwich-discuss `_ mailing list. Contributing ------------ For a full list of contributors, see the git logs. If you'd like to contribute to Dulwich, see the `CONTRIBUTING `_ file and `list of open issues `_. Supported versions of Python ---------------------------- At the moment, Dulwich supports (and is tested on) CPython 3.10 and later and Pypy. dulwich-1.0.0/SECURITY.md000066400000000000000000000005221513301442600147360ustar00rootroot00000000000000# Security Policy ## Supported Versions | Version | Supported | | -------- | ------------------ | | 0.24.x | :white_check_mark: | | < 0.24.x | :x: | ## Reporting a Vulnerability Please report security issues by e-mail to jelmer@jelmer.uk, ideally PGP encrypted to the key at dulwich-1.0.0/TODO000066400000000000000000000002001513301442600136260ustar00rootroot00000000000000- 'git annotate' equivalent - reflog handling Performance: - more efficient pack iteration - more efficient delta generation dulwich-1.0.0/bin/000077500000000000000000000000001513301442600137165ustar00rootroot00000000000000dulwich-1.0.0/bin/dul-receive-pack000077500000000000000000000023601513301442600167650ustar00rootroot00000000000000#!/usr/bin/python3 -u # dul-receive-pack - git-receive-pack in python # Copyright (C) 2008 John Carr # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # import os import sys from dulwich.porcelain import receive_pack if len(sys.argv) < 2: sys.stderr.write("usage: %s \n" % os.path.basename(sys.argv[0])) sys.exit(1) sys.exit(receive_pack(sys.argv[1])) dulwich-1.0.0/bin/dul-upload-pack000077500000000000000000000023541513301442600166320ustar00rootroot00000000000000#!/usr/bin/python3 -u # dul-upload-pack - git-upload-pack in python # Copyright (C) 2008 John Carr # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # import os import sys from dulwich.porcelain import upload_pack if len(sys.argv) < 2: sys.stderr.write("usage: %s \n" % os.path.basename(sys.argv[0])) sys.exit(1) sys.exit(upload_pack(sys.argv[1])) dulwich-1.0.0/bin/dulwich000077500000000000000000000021171513301442600153040ustar00rootroot00000000000000#!/usr/bin/python3 -u # command-line interface for Dulwich # Copyright (C) 2020 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # import sys from dulwich.cli import main sys.exit(main(sys.argv[1:])) dulwich-1.0.0/crates/000077500000000000000000000000001513301442600144275ustar00rootroot00000000000000dulwich-1.0.0/crates/diff-tree/000077500000000000000000000000001513301442600162745ustar00rootroot00000000000000dulwich-1.0.0/crates/diff-tree/Cargo.toml000066400000000000000000000002731513301442600202260ustar00rootroot00000000000000[package] name = "diff-tree-py" version = { workspace = true } edition = "2021" [lib] crate-type = ["cdylib"] [dependencies] pyo3 = { workspace = true, features = ["extension-module"]} dulwich-1.0.0/crates/diff-tree/src/000077500000000000000000000000001513301442600170635ustar00rootroot00000000000000dulwich-1.0.0/crates/diff-tree/src/lib.rs000066400000000000000000000141671513301442600202100ustar00rootroot00000000000000/* * Copyright (C) 2010 Google, Inc. 
* Copyright (C) 2024 Jelmer Vernooij * * Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU * General Public License as published by the Free Software Foundation; version 2.0 * or (at your option) any later version. You can redistribute it and/or * modify it under the terms of either of these two licenses. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * You should have received a copy of the licenses; if not, see * for a copy of the GNU General Public License * and for a copy of the Apache * License, Version 2.0. */ use pyo3::prelude::*; use pyo3::exceptions::PyTypeError; use pyo3::types::{PyBytes, PyList, PyTuple}; use pyo3::Python; use std::cmp::Ordering; const S_IFMT: u32 = 0o170000; const S_IFDIR: u32 = 0o040000; fn add_hash(get: &Bound, set: &Bound, string: &[u8], py: Python) -> PyResult<()> { let str_obj = PyBytes::new(py, string); let hash_obj = str_obj.hash()?; let value = get.call1((hash_obj,))?; let n = string.len(); set.call1((hash_obj, value.extract::()? + n))?; Ok(()) } #[pyfunction] fn _count_blocks(py: Python, obj: &Bound) -> PyResult> { let default_dict_cls = PyModule::import(py, "collections")?.getattr("defaultdict")?; let int_cls = PyModule::import(py, "builtins")?.getattr("int")?; let counts = default_dict_cls.call1((int_cls,))?; let get = counts.getattr("__getitem__")?; let set = counts.getattr("__setitem__")?; let chunks = obj.call_method0("as_raw_chunks")?; if !chunks.is_instance_of::() { return Err(PyTypeError::new_err( "as_raw_chunks() did not return a list", )); } let num_chunks = chunks.extract::>>()?.len(); let pym = py.import("dulwich.diff_tree")?; let block_size = pym.getattr("_BLOCK_SIZE")?.extract::()?; let mut block: Vec = Vec::with_capacity(block_size); for i in 0..num_chunks { let chunk = chunks.get_item(i)?; if !chunk.is_instance_of::() { return Err(PyTypeError::new_err("chunk is not a string")); } let chunk_str = chunk.extract::<&[u8]>()?; for c in chunk_str { block.push(*c); if *c == b'\n' || block.len() == block_size { add_hash(&get, &set, &block, py)?; block.clear(); } } } if !block.is_empty() { add_hash(&get, &set, &block, py)?; } Ok(counts.into_pyobject(py).unwrap().into()) } #[pyfunction] fn _is_tree(_py: Python, entry: &Bound) -> PyResult { if entry.is_none() { return Ok(false); } let mode = entry.getattr("mode")?; if mode.is_none() { Ok(false) } else { let lmode = mode.extract::()?; Ok((lmode & S_IFMT) == S_IFDIR) } } fn tree_entries(path: &[u8], tree: &Bound, py: Python) -> PyResult>> { if tree.is_none() { return Ok(Vec::new()); } let dom = py.import("dulwich.objects")?; let tree_entry_cls = dom.getattr("TreeEntry")?; let items = tree .call_method1("iteritems", (true,))? 
.extract::>>()?; let mut result = Vec::new(); for item in items { let (name, mode, sha) = item.extract::<(Vec, u32, Py)>(py)?; let mut new_path = Vec::with_capacity(path.len() + name.len() + 1); if !path.is_empty() { new_path.extend_from_slice(path); new_path.push(b'/'); } new_path.extend_from_slice(name.as_slice()); let tree_entry = tree_entry_cls.call1((PyBytes::new(py, &new_path), mode, sha))?; result.push(tree_entry.into_pyobject(py).unwrap().into()); } Ok(result) } fn entry_path_cmp(entry1: &Bound, entry2: &Bound) -> PyResult { let path1_o = entry1.getattr("path")?; let path1 = path1_o.extract::<&[u8]>()?; let path2_o = entry2.getattr("path")?; let path2 = path2_o.extract::<&[u8]>()?; Ok(path1.cmp(path2)) } #[pyfunction] fn _merge_entries( py: Python, path: &[u8], tree1: &Bound, tree2: &Bound, ) -> PyResult> { let entries1 = tree_entries(path, tree1, py)?; let entries2 = tree_entries(path, tree2, py)?; let mut result = Vec::new(); let mut i1 = 0; let mut i2 = 0; while i1 < entries1.len() && i2 < entries2.len() { let cmp = entry_path_cmp(entries1[i1].bind(py), entries2[i2].bind(py))?; let (e1, e2) = match cmp { Ordering::Equal => (entries1[i1].clone_ref(py), entries2[i2].clone_ref(py)), Ordering::Less => (entries1[i1].clone_ref(py), py.None()), Ordering::Greater => (py.None(), entries2[i2].clone_ref(py)), }; let pair = PyTuple::new(py, &[e1, e2]).unwrap(); result.push(pair); match cmp { Ordering::Equal => { i1 += 1; i2 += 1; } Ordering::Less => { i1 += 1; } Ordering::Greater => { i2 += 1; } } } while i1 < entries1.len() { let pair = PyTuple::new(py, &[entries1[i1].clone_ref(py), py.None()]).unwrap(); result.push(pair); i1 += 1; } while i2 < entries2.len() { let pair = PyTuple::new(py, &[py.None(), entries2[i2].clone_ref(py)]).unwrap(); result.push(pair); i2 += 1; } Ok(PyList::new(py, &result).unwrap().unbind().into()) } #[pymodule] fn _diff_tree(_py: Python, m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(_count_blocks, m)?)?; m.add_function(wrap_pyfunction!(_is_tree, m)?)?; m.add_function(wrap_pyfunction!(_merge_entries, m)?)?; Ok(()) } dulwich-1.0.0/crates/objects/000077500000000000000000000000001513301442600160605ustar00rootroot00000000000000dulwich-1.0.0/crates/objects/Cargo.toml000066400000000000000000000003061513301442600200070ustar00rootroot00000000000000[package] name = "objects-py" version = { workspace = true } edition = "2021" [lib] crate-type = ["cdylib"] [dependencies] pyo3 = { workspace = true, features = ["extension-module"]} memchr = "2" dulwich-1.0.0/crates/objects/src/000077500000000000000000000000001513301442600166475ustar00rootroot00000000000000dulwich-1.0.0/crates/objects/src/lib.rs000066400000000000000000000132341513301442600177660ustar00rootroot00000000000000/* * Copyright (C) 2009 Jelmer Vernooij * * Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU * General Public License as published by the Free Software Foundation; version 2.0 * or (at your option) any later version. You can redistribute it and/or * modify it under the terms of either of these two licenses. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* * You should have received a copy of the licenses; if not, see * for a copy of the GNU General Public License * and for a copy of the Apache * License, Version 2.0. */ use memchr::memchr; use pyo3::exceptions::PyTypeError; use pyo3::import_exception; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyDict}; import_exception!(dulwich.errors, ObjectFormatException); const S_IFDIR: u32 = 0o40000; const S_IFMT: u32 = 0o170000; // File type mask #[inline] fn bytehex(byte: u8) -> u8 { match byte { 0..=9 => byte + b'0', 10..=15 => byte - 10 + b'a', _ => unreachable!(), } } fn sha_to_pyhex(py: Python, sha: &[u8]) -> PyResult> { let mut hexsha = Vec::new(); for c in sha { hexsha.push(bytehex((c & 0xF0) >> 4)); hexsha.push(bytehex(c & 0x0F)); } Ok(PyBytes::new(py, hexsha.as_slice()).into()) } #[pyfunction] #[pyo3(signature = (text, sha_len, strict=None))] fn parse_tree( py: Python, mut text: &[u8], sha_len: usize, strict: Option, ) -> PyResult, u32, Py)>> { let strict = strict.unwrap_or(false); let mut entries = Vec::new(); while !text.is_empty() { let mode_end = memchr(b' ', text) .ok_or_else(|| ObjectFormatException::new_err(("Missing terminator for mode",)))?; let text_str = String::from_utf8_lossy(&text[..mode_end]).to_string(); let mode = u32::from_str_radix(text_str.as_str(), 8) .map_err(|e| ObjectFormatException::new_err((format!("invalid mode: {}", e),)))?; if strict && text[0] == b'0' { return Err(ObjectFormatException::new_err(( "Illegal leading zero on mode", ))); } text = &text[mode_end + 1..]; let namelen = memchr(b'\0', text) .ok_or_else(|| ObjectFormatException::new_err(("Missing trailing \\0",)))?; let name = &text[..namelen]; // Skip name and null terminator text = &text[namelen + 1..]; // Check if we have enough bytes for the hash if text.len() < sha_len { return Err(ObjectFormatException::new_err(("SHA truncated",))); } let sha = &text[..sha_len]; entries.push(( PyBytes::new(py, name).into_pyobject(py)?.unbind().into(), mode, sha_to_pyhex(py, sha)?, )); text = &text[sha_len..]; } Ok(entries) } fn cmp_with_suffix(a: (u32, &[u8]), b: (u32, &[u8])) -> std::cmp::Ordering { let len = std::cmp::min(a.1.len(), b.1.len()); let cmp = a.1[..len].cmp(&b.1[..len]); if cmp != std::cmp::Ordering::Equal { return cmp; } let c1 = a.1.get(len) .map_or_else(|| if (a.0 & S_IFMT) == S_IFDIR { b'/' } else { 0 }, |&c| c); let c2 = b.1.get(len) .map_or_else(|| if (b.0 & S_IFMT) == S_IFDIR { b'/' } else { 0 }, |&c| c); c1.cmp(&c2) } /// Iterate over a tree entries dictionary. /// /// # Arguments /// /// name_order: If True, iterate entries in order of their name. If /// False, iterate entries in tree order, that is, treat subtree entries as /// having '/' appended. 
/// entries: Dictionary mapping names to (mode, sha) tuples /// /// # Returns: Iterator over (name, mode, hexsha) #[pyfunction] fn sorted_tree_items( py: Python, entries: &Bound, name_order: bool, ) -> PyResult>> { let mut qsort_entries = entries .iter() .map(|(name, value)| -> PyResult<(Vec, u32, Vec)> { let value = value .extract::<(u32, Vec)>() .map_err(|e| PyTypeError::new_err((format!("invalid type: {}", e),)))?; Ok((name.extract::>().unwrap(), value.0, value.1)) }) .collect::, u32, Vec)>>>()?; if name_order { qsort_entries.sort_by(|a, b| a.0.cmp(&b.0)); } else { qsort_entries.sort_by(|a, b| cmp_with_suffix((a.1, a.0.as_slice()), (b.1, b.0.as_slice()))); } let objectsm = py.import("dulwich.objects")?; let tree_entry_cls = objectsm.getattr("TreeEntry")?; qsort_entries .into_iter() .map(|(name, mode, hexsha)| -> PyResult> { Ok(tree_entry_cls .call1(( PyBytes::new(py, name.as_slice()) .into_pyobject(py)? .unbind() .into_any(), mode, PyBytes::new(py, hexsha.as_slice()) .into_pyobject(py)? .unbind() .into_any(), ))? .unbind()) }) .collect::>>>() } #[pymodule] fn _objects(_py: Python, m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(sorted_tree_items, m)?)?; m.add_function(wrap_pyfunction!(parse_tree, m)?)?; Ok(()) } dulwich-1.0.0/crates/pack/000077500000000000000000000000001513301442600153455ustar00rootroot00000000000000dulwich-1.0.0/crates/pack/Cargo.toml000066400000000000000000000003211513301442600172710ustar00rootroot00000000000000[package] name = "pack-py" version = { workspace = true } edition = "2021" [lib] crate-type = ["cdylib"] [dependencies] pyo3 = { workspace = true, features = ["extension-module"]} memchr = "2" similar = "2" dulwich-1.0.0/crates/pack/src/000077500000000000000000000000001513301442600161345ustar00rootroot00000000000000dulwich-1.0.0/crates/pack/src/lib.rs000066400000000000000000000525761513301442600172670ustar00rootroot00000000000000/* * Copyright (C) 2009 Jelmer Vernooij * * Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU * General Public License as published by the Free Software Foundation; version 2.0 * or (at your option) any later version. You can redistribute it and/or * modify it under the terms of either of these two licenses. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * You should have received a copy of the licenses; if not, see * for a copy of the GNU General Public License * and for a copy of the Apache * License, Version 2.0. 
*/ // Allow PyO3 macro-generated interior mutable constants #![allow(clippy::declare_interior_mutable_const)] use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyList}; pyo3::import_exception!(dulwich.errors, ApplyDeltaError); fn py_is_sha(sha: &Py, py: Python) -> PyResult { // Check if the object is a bytes object if sha.bind(py).is_instance_of::() { // Check if the bytes object has a size of 20 (SHA1) or 32 (SHA256) let len = sha.extract::<&[u8]>(py)?.len(); if len == 20 || len == 32 { Ok(true) } else { Ok(false) } } else { Ok(false) } } #[pyfunction] fn bisect_find_sha( py: Python, start: i32, end: i32, sha: Py, unpack_name: Py, ) -> PyResult> { // Convert sha_obj to a byte slice let sha = sha.as_bytes(py); let sha_len = sha.len(); // Check if sha is 20 bytes (SHA1) or 32 bytes (SHA256) if sha_len != 20 && sha_len != 32 { return Err(PyValueError::new_err( "Sha must be 20 (SHA1) or 32 (SHA256) bytes long", )); } // Check if start > end if start > end { return Err(PyValueError::new_err("start > end")); } // Binary search loop let mut start = start; let mut end = end; loop { if start > end { break; } let i = (start + end) / 2; let file_sha = unpack_name.call1(py, (i,))?; if !py_is_sha(&file_sha, py)? { return Err(PyTypeError::new_err("unpack_name returned non-sha object")); } match file_sha.extract::<&[u8]>(py).unwrap().cmp(sha) { std::cmp::Ordering::Less => { start = i + 1; } std::cmp::Ordering::Greater => { end = i - 1; } std::cmp::Ordering::Equal => { return Ok(Some(i)); } } } Ok(None) } fn get_delta_header_size(delta: &[u8], index: &mut usize, length: usize) -> usize { let mut size: usize = 0; let mut i: usize = 0; while *index < length { let cmd = delta[*index]; *index += 1; size |= ((cmd & !0x80) as usize) << i; i += 7; if cmd & 0x80 == 0 { break; } } size } fn py_chunked_as_string<'a>( py: Python<'a>, py_buf: &'a Py, ) -> PyResult> { if let Ok(py_list) = py_buf.extract::>(py) { let mut buf = Vec::new(); for chunk in py_list.iter() { if let Ok(chunk) = chunk.extract::<&[u8]>() { buf.extend_from_slice(chunk); } else if let Ok(chunk) = chunk.extract::>() { buf.extend(chunk); } else { return Err(PyTypeError::new_err(format!( "chunk is not a byte string, but a {:?}", chunk.get_type().name() ))); } } Ok(buf.into()) } else if py_buf.extract::>(py).is_ok() { Ok(std::borrow::Cow::Borrowed(py_buf.extract::<&[u8]>(py)?)) } else { Err(PyTypeError::new_err( "buf is not a string or a list of chunks", )) } } #[pyfunction] fn apply_delta(py: Python, py_src_buf: Py, py_delta: Py) -> PyResult>> { let src_buf = py_chunked_as_string(py, &py_src_buf)?; let delta = py_chunked_as_string(py, &py_delta)?; let src_buf_len = src_buf.len(); let delta_len = delta.len(); let mut index = 0; let src_size = get_delta_header_size(delta.as_ref(), &mut index, delta_len); if src_size != src_buf_len { return Err(ApplyDeltaError::new_err(format!( "Unexpected source buffer size: {} vs {}", src_size, src_buf_len ))); } let dest_size = get_delta_header_size(delta.as_ref(), &mut index, delta_len); let mut out = vec![0; dest_size]; let mut outindex = 0; while index < delta_len { let cmd = delta[index]; index += 1; if cmd & 0x80 != 0 { let mut cp_off = 0; let mut cp_size = 0; for i in 0..4 { if cmd & (1 << i) != 0 { let x = delta[index] as usize; index += 1; cp_off |= x << (i * 8); } } for i in 0..3 { if cmd & (1 << (4 + i)) != 0 { let x = delta[index] as usize; index += 1; cp_size |= x << (i * 8); } } if cp_size == 0 { cp_size = 0x10000; } // Check for overflow and bounds if 
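            // The copy must stay entirely inside the source buffer: the length
            // may not exceed the source size, the offset must lie within it, and
            // `cp_off > src_size - cp_size` is the overflow-safe spelling of
            // `cp_off + cp_size > src_size`. The copy also may not be larger
            // than the destination buffer.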
cp_size > src_size || cp_off > src_size || cp_off > src_size - cp_size || cp_size > dest_size { break; } out[outindex..outindex + cp_size].copy_from_slice(&src_buf[cp_off..cp_off + cp_size]); outindex += cp_size; } else if cmd != 0 { if (cmd as usize) > dest_size { break; } // Raise ApplyDeltaError if there are more bytes to copy than space if outindex + cmd as usize > dest_size { return Err(ApplyDeltaError::new_err("Not enough space to copy")); } out[outindex..outindex + cmd as usize] .copy_from_slice(&delta[index..index + cmd as usize]); outindex += cmd as usize; index += cmd as usize; } else { return Err(ApplyDeltaError::new_err("Invalid opcode 0")); } } if index != delta_len { return Err(ApplyDeltaError::new_err("delta not empty")); } if outindex != dest_size { return Err(ApplyDeltaError::new_err("dest size incorrect")); } Ok(vec![PyBytes::new(py, &out).into()]) } /// Encode a size value for delta headers using variable-length encoding. /// This matches Python's _delta_encode_size function. fn delta_encode_size(mut size: usize) -> Vec { let mut ret = Vec::new(); let mut c = (size & 0x7F) as u8; size >>= 7; while size > 0 { ret.push(c | 0x80); c = (size & 0x7F) as u8; size >>= 7; } ret.push(c); ret } /// The length of delta compression copy operations in version 2 packs is limited /// to 64K. To copy more, we use several copy operations. const MAX_COPY_LEN: usize = 0xFFFF; /// Encode a copy operation for the delta format. /// This matches Python's _encode_copy_operation function. fn encode_copy_operation(start: usize, length: usize) -> Vec { let mut scratch = vec![0x80u8]; // Encode offset (4 bytes max) for i in 0..4 { if start & (0xFF << (i * 8)) != 0 { scratch.push(((start >> (i * 8)) & 0xFF) as u8); scratch[0] |= 1 << i; } } // Encode length (2 bytes for version 2 packs) for i in 0..2 { if length & (0xFF << (i * 8)) != 0 { scratch.push(((length >> (i * 8)) & 0xFF) as u8); scratch[0] |= 1 << (4 + i); } } scratch } /// Create a delta that transforms base_buf into target_buf. /// This uses the similar crate to find matching sequences, similar to /// Python's difflib.SequenceMatcher. fn create_delta_internal(base_buf: &[u8], target_buf: &[u8]) -> Vec { let mut result = Vec::new(); // Write delta header result.extend(delta_encode_size(base_buf.len())); result.extend(delta_encode_size(target_buf.len())); // Use similar crate to compute the diff at byte level let ops = similar::capture_diff_slices(similar::Algorithm::Myers, base_buf, target_buf); let mut old_pos = 0; let mut new_pos = 0; for op in ops { match op { similar::DiffOp::Equal { old_index, new_index, len, } => { // Sanity check assert_eq!(old_index, old_pos); assert_eq!(new_index, new_pos); // Emit copy operations from base_buf let mut copy_start = old_index; let mut copy_len = len; while copy_len > 0 { let to_copy = copy_len.min(MAX_COPY_LEN); result.extend(encode_copy_operation(copy_start, to_copy)); copy_start += to_copy; copy_len -= to_copy; } old_pos += len; new_pos += len; } similar::DiffOp::Delete { old_index, old_len, .. } => { // Git delta format doesn't care about deletes from base assert_eq!(old_index, old_pos); old_pos += old_len; } similar::DiffOp::Insert { new_index, new_len, .. 
} => { // Emit literal data from target_buf assert_eq!(new_index, new_pos); let data = &target_buf[new_index..new_index + new_len]; let mut remaining = data.len(); let mut offset = 0; while remaining > 0 { let chunk_size = remaining.min(127); result.push(chunk_size as u8); result.extend_from_slice(&data[offset..offset + chunk_size]); offset += chunk_size; remaining -= chunk_size; } new_pos += new_len; } similar::DiffOp::Replace { old_index, old_len, new_index, new_len, } => { // For replace operations, we delete from old and insert from new // Git delta format doesn't care about deletes, so just emit insert assert_eq!(old_index, old_pos); assert_eq!(new_index, new_pos); let data = &target_buf[new_index..new_index + new_len]; let mut remaining = data.len(); let mut offset = 0; while remaining > 0 { let chunk_size = remaining.min(127); result.push(chunk_size as u8); result.extend_from_slice(&data[offset..offset + chunk_size]); offset += chunk_size; remaining -= chunk_size; } old_pos += old_len; new_pos += new_len; } } } result } #[pyfunction] fn create_delta( py: Python, py_base_buf: Py, py_target_buf: Py, ) -> PyResult> { let base_buf = py_chunked_as_string(py, &py_base_buf)?; let target_buf = py_chunked_as_string(py, &py_target_buf)?; let delta = create_delta_internal(base_buf.as_ref(), target_buf.as_ref()); Ok(PyBytes::new(py, &delta).into()) } #[cfg(test)] mod tests { use super::*; #[test] fn test_delta_encode_size_zero() { assert_eq!(delta_encode_size(0), vec![0]); } #[test] fn test_delta_encode_size_small() { // Values that fit in 7 bits (0-127) assert_eq!(delta_encode_size(1), vec![1]); assert_eq!(delta_encode_size(127), vec![127]); } #[test] fn test_delta_encode_size_medium() { // Values that need 2 bytes (128-16383) assert_eq!(delta_encode_size(128), vec![0x80, 0x01]); assert_eq!(delta_encode_size(256), vec![0x80, 0x02]); assert_eq!(delta_encode_size(16383), vec![0xFF, 0x7F]); } #[test] fn test_delta_encode_size_large() { // Values that need 3 bytes (16384-2097151) assert_eq!(delta_encode_size(16384), vec![0x80, 0x80, 0x01]); assert_eq!(delta_encode_size(65536), vec![0x80, 0x80, 0x04]); } #[test] fn test_delta_encode_size_very_large() { // Values that need 4+ bytes assert_eq!(delta_encode_size(1048576), vec![0x80, 0x80, 0x40]); // 1MB = 2^20 assert_eq!(delta_encode_size(16777216), vec![0x80, 0x80, 0x80, 0x08]); // 16MB = 2^24 } #[test] fn test_get_delta_header_size_basic() { // Test decoding various encoded sizes let mut index = 0; let delta = vec![0x00]; assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 0); assert_eq!(index, 1); let mut index = 0; let delta = vec![0x01]; assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 1); assert_eq!(index, 1); let mut index = 0; let delta = vec![127]; assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 127); assert_eq!(index, 1); } #[test] fn test_get_delta_header_size_multibyte() { // Test decoding multi-byte sizes let mut index = 0; let delta = vec![0x80, 0x01]; assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 128); assert_eq!(index, 2); let mut index = 0; let delta = vec![0x80, 0x02]; assert_eq!(get_delta_header_size(&delta, &mut index, delta.len()), 256); assert_eq!(index, 2); let mut index = 0; let delta = vec![0x80, 0x80, 0x01]; assert_eq!( get_delta_header_size(&delta, &mut index, delta.len()), 16384 ); assert_eq!(index, 3); } #[test] fn test_delta_encode_decode_roundtrip() { // Test that encoding and decoding are inverse operations let test_values = vec![0, 1, 127, 128, 
255, 256, 1000, 16384, 65536, 1048576]; for value in test_values { let encoded = delta_encode_size(value); let mut index = 0; let decoded = get_delta_header_size(&encoded, &mut index, encoded.len()); assert_eq!( decoded, value, "Roundtrip failed for value {}: encoded {:?}, decoded {}", value, encoded, decoded ); assert_eq!(index, encoded.len()); } } #[test] fn test_encode_copy_operation_zero_offset() { // Copy from offset 0 let result = encode_copy_operation(0, 10); // Should have copy bit set assert_eq!(result[0] & 0x80, 0x80); // Should encode length 10 assert_eq!(result[0] & 0x10, 0x10); // Length bit 0 set assert_eq!(result[1], 10); assert_eq!(result.len(), 2); } #[test] fn test_encode_copy_operation_small_offset() { // Copy from offset 100, length 20 let result = encode_copy_operation(100, 20); assert_eq!(result[0] & 0x80, 0x80); // Copy bit assert_eq!(result[0] & 0x01, 0x01); // Offset byte 0 present assert_eq!(result[0] & 0x10, 0x10); // Length byte 0 present assert_eq!(result[1], 100); // Offset byte 0 assert_eq!(result[2], 20); // Length byte 0 assert_eq!(result.len(), 3); } #[test] fn test_encode_copy_operation_large_offset() { // Copy from offset 0x12345, length 0x678 let result = encode_copy_operation(0x12345, 0x678); assert_eq!(result[0] & 0x80, 0x80); // Copy bit assert_eq!(result[0] & 0x07, 0x07); // Offset bytes 0,1,2 present assert_eq!(result[0] & 0x30, 0x30); // Length bytes 0,1 present assert_eq!(result[1], 0x45); // Offset byte 0 assert_eq!(result[2], 0x23); // Offset byte 1 assert_eq!(result[3], 0x01); // Offset byte 2 assert_eq!(result[4], 0x78); // Length byte 0 assert_eq!(result[5], 0x06); // Length byte 1 assert_eq!(result.len(), 6); } #[test] fn test_encode_copy_operation_max_offset() { // Test maximum offset (needs 4 bytes) let max_offset = 0xFFFFFFFF; let result = encode_copy_operation(max_offset, 1); assert_eq!(result[0] & 0x80, 0x80); // Copy bit assert_eq!(result[0] & 0x0F, 0x0F); // All 4 offset bytes present assert_eq!(result[1], 0xFF); // Offset byte 0 assert_eq!(result[2], 0xFF); // Offset byte 1 assert_eq!(result[3], 0xFF); // Offset byte 2 assert_eq!(result[4], 0xFF); // Offset byte 3 assert_eq!(result.len(), 6); // 1 cmd + 4 offset + 1 length } #[test] fn test_encode_copy_operation_max_length() { // Test maximum length for version 2 packs (0xFFFF) let result = encode_copy_operation(0, MAX_COPY_LEN); assert_eq!(result[0] & 0x80, 0x80); // Copy bit assert_eq!(result[0] & 0x30, 0x30); // Both length bytes present assert_eq!(result[1], 0xFF); // Length byte 0 assert_eq!(result[2], 0xFF); // Length byte 1 assert_eq!(result.len(), 3); } #[test] fn test_encode_copy_operation_various_lengths() { // Test different length values to ensure correct encoding // Note: only non-zero bytes are encoded // Length 1: byte0=1 -> only bit 4 set let result = encode_copy_operation(0, 1); assert_eq!(result[0] & 0x80, 0x80); assert_eq!(result[0] & 0x30, 0x10); assert_eq!(result[1], 1); // Length 255 (0xFF): byte0=0xFF, byte1=0 -> only bit 4 set let result = encode_copy_operation(0, 255); assert_eq!(result[0] & 0x80, 0x80); assert_eq!(result[0] & 0x30, 0x10); assert_eq!(result[1], 0xFF); // Length 256 (0x100): byte0=0, byte1=1 -> only bit 5 set let result = encode_copy_operation(0, 256); assert_eq!(result[0] & 0x80, 0x80); assert_eq!(result[0] & 0x30, 0x20); // Only second length byte assert_eq!(result[1], 1); // Length 1000 (0x3E8): byte0=0xE8, byte1=3 -> both bits set let result = encode_copy_operation(0, 1000); assert_eq!(result[0] & 0x80, 0x80); assert_eq!(result[0] & 0x30, 
0x30); // Both length bytes assert_eq!(result[1], 0xE8); assert_eq!(result[2], 0x03); // Length 0xFFFF: byte0=0xFF, byte1=0xFF -> both bits set let result = encode_copy_operation(0, 0xFFFF); assert_eq!(result[0] & 0x80, 0x80); assert_eq!(result[0] & 0x30, 0x30); // Both length bytes assert_eq!(result[1], 0xFF); assert_eq!(result[2], 0xFF); } #[test] fn test_create_delta_identical() { // Delta between identical buffers should be minimal let base = b"hello world"; let target = b"hello world"; let delta = create_delta_internal(base, target); // Should have header (2 size encodings) plus copy operations assert!(delta.len() < base.len()); // Delta should be smaller than full data } #[test] fn test_create_delta_completely_different() { // Delta between completely different buffers let base = b"aaaaaaaaaa"; let target = b"bbbbbbbbbb"; let delta = create_delta_internal(base, target); // Should have header plus insert operations with the new data assert!(delta.len() > 0); } #[test] fn test_create_and_apply_delta() { // Test that create_delta and apply_delta are inverse operations let base = b"The quick brown fox jumps over the lazy dog"; let target = b"The quick brown cat jumps over the lazy dog"; // Create delta let delta = create_delta_internal(base, target); // Apply delta should reconstruct target let mut index = 0; let src_size = get_delta_header_size(&delta, &mut index, delta.len()); assert_eq!(src_size, base.len()); let dest_size = get_delta_header_size(&delta, &mut index, delta.len()); assert_eq!(dest_size, target.len()); // The delta should be valid and smaller than sending the full target assert!(delta.len() > 0); } #[test] fn test_create_delta_with_insertion() { let base = b"hello"; let target = b"hello world"; let delta = create_delta_internal(base, target); // Should have a copy operation for "hello" and insert for " world" assert!(delta.len() > 0); } #[test] fn test_create_delta_with_deletion() { let base = b"hello world"; let target = b"hello"; let delta = create_delta_internal(base, target); // Should have a copy operation for "hello" only assert!(delta.len() > 0); } } #[pymodule] fn _pack(_py: Python, m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(bisect_find_sha, m)?)?; m.add_function(wrap_pyfunction!(apply_delta, m)?)?; m.add_function(wrap_pyfunction!(create_delta, m)?)?; Ok(()) } dulwich-1.0.0/devscripts/000077500000000000000000000000001513301442600153345ustar00rootroot00000000000000dulwich-1.0.0/devscripts/PREAMBLE.py000066400000000000000000000016221513301442600170760ustar00rootroot00000000000000# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# dulwich-1.0.0/devscripts/replace-preamble.sh000077500000000000000000000004071513301442600210740ustar00rootroot00000000000000#!/usr/bin/zsh # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later perl -i -p0e "s{\Q$(cat PREAMBLE.py.old)\E}{$(cat devscripts/PREAMBLE.py)}g" dulwich/**/*.py bin/dul* perl -i -p0e "s{\Q$(cat PREAMBLE.c.old)\E}{$(cat devscripts/PREAMBLE.c)}g" dulwich/*.c dulwich-1.0.0/disperse.toml000066400000000000000000000006041513301442600156610ustar00rootroot00000000000000tag-name = "dulwich-$VERSION" news-file = "NEWS" verify-command = "make check" twine-upload = false tarball-location = [] release-timeout = 5 ci-timeout = 7200 [[update_version]] path = "dulwich/__init__.py" match = "^__version__ = ((.*))$" new-line = "__version__ = $TUPLED_VERSION" [[update_version]] path = "Cargo.toml" match = '^version = "(.*)"$' new-line = 'version = "$VERSION"' dulwich-1.0.0/docs/000077500000000000000000000000001513301442600140765ustar00rootroot00000000000000dulwich-1.0.0/docs/Makefile000066400000000000000000000065351513301442600155470ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " pdf to make PDF document" @echo " dirhtml to make HTML files named index.html in directories" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* apidocs: sphinx-apidoc -feM -s txt -o api ../dulwich html: apidocs $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: apidocs $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." pickle: apidocs $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: apidocs $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: apidocs $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." 
qthelp: apidocs $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/dulwich.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/dulwich.qhc" latex: apidocs $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ "run these through (pdf)latex." changes: apidocs $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." pdf: $(SPHINXBUILD) -b pdf $(ALLSPHINXOPTS) $(BUILDDIR)/pdf @echo @echo "Build finished. The PDF files are in $(BUILDDIR)/pdf." dulwich-1.0.0/docs/api/000077500000000000000000000000001513301442600146475ustar00rootroot00000000000000dulwich-1.0.0/docs/api/index.txt000066400000000000000000000002471513301442600165220ustar00rootroot00000000000000This is the API documentation for Dulwich. Module reference ---------------- .. toctree:: :maxdepth: 3 modules Indices: * :ref:`modindex` * :ref:`search` dulwich-1.0.0/docs/c-git-compatibility.txt000066400000000000000000000365041513301442600205210ustar00rootroot00000000000000.. _c-git-compatibility: ======================== C Git Compatibility ======================== This document lists Git functionality and indicates what Dulwich supports. Dulwich is a pure Python implementation of Git that provides wire-format and repository format compatibility with C Git. 
Legend: * ✓ - Fully supported * ◐ - Partially supported * ✗ - Not supported Main Porcelain Commands ======================== Repository Management --------------------- * ✓ ``git init`` - Initialize repository * ✓ ``git clone`` - Clone repository * ✓ ``git config`` - Read and write configuration Working with Files ------------------ * ✓ ``git add`` - Add file contents to the index * ✓ ``git rm`` - Remove files from working tree and index * ✓ ``git mv`` - Move or rename file, directory, or symlink * ✓ ``git restore`` - Restore working tree files * ✓ ``git reset`` - Reset current HEAD to specified state * ✓ ``git clean`` - Remove untracked files Commits ------- * ✓ ``git commit`` - Record changes to the repository * ✓ ``git show`` - Show various types of objects * ✓ ``git log`` - Show commit logs * ✓ ``git shortlog`` - Summarize git log output * ✓ ``git describe`` - Describe a commit using the most recent tag * ✓ ``git annotate`` - Annotate file lines with commit information * ✓ ``git blame`` - Show what revision and author last modified each line * ✗ ``git citool`` - Graphical alternative to git-commit * ✗ ``gitk`` - Git repository browser Branches -------- * ✓ ``git branch`` - List, create, or delete branches * ✓ ``git checkout`` - Switch branches or restore working tree files * ✓ ``git switch`` - Switch branches * ✓ ``git show-branch`` - Show branches and their commits * ✓ ``git worktree`` - Manage multiple working trees Tags ---- * ✓ ``git tag`` - Create, list, delete, or verify tags * ✓ ``git verify-tag`` - Check GPG/SSH signature of tags * ✓ ``git verify-commit`` - Check GPG/SSH signature of commits Merging ------- * ✓ ``git merge`` - Join two or more development histories * ✓ ``git merge-base`` - Find common ancestor for merge * ✗ ``git mergetool`` - Run merge conflict resolution tool interactively * ✓ ``git rebase`` - Reapply commits on top of another base tip * ◐ ``git rebase -i`` - Interactive rebase (limited support) * ✓ ``git cherry-pick`` - Apply changes introduced by existing commits * ✓ ``git revert`` - Revert existing commits * ✓ ``git cherry`` - Find commits not merged upstream Remotes ------- * ✓ ``git fetch`` - Download objects and refs from another repository * ✓ ``git pull`` - Fetch from and integrate with another repository * ✓ ``git push`` - Update remote refs along with associated objects * ✓ ``git remote`` - Manage set of tracked repositories * ✓ ``git ls-remote`` - List references in a remote repository Inspection ---------- * ✓ ``git status`` - Show the working tree status * ✓ ``git diff`` - Show changes between commits, commit and working tree, etc * ✓ ``git grep`` - Print lines matching a pattern * ✓ ``git bisect`` - Use binary search to find commit that introduced a bug Patching -------- * ✓ ``git format-patch`` - Prepare patches for email submission * ✗ ``git am`` - Apply series of patches from mailbox * ✗ ``git apply`` - Apply patch to files * ✓ ``git mailsplit`` - Simple UNIX mbox splitter program * ✓ ``git mailinfo`` - Extracts patch and authorship from a single email * ✗ ``git send-email`` - Send collection of patches as emails * ✗ ``git request-pull`` - Generate summary of pending changes Debugging --------- * ✓ ``git fsck`` - Verify connectivity and validity of objects * ✓ ``git check-ignore`` - Debug gitignore / exclude files * ✓ ``git check-mailmap`` - Show canonical names and email addresses * ✗ ``git instaweb`` - Instantly browse your working repository Administration -------------- * ✓ ``git gc`` - Cleanup unnecessary files and optimize repository * 
✓ ``git reflog`` - Manage reflog information * ✓ ``git filter-branch`` - Rewrite branches * ✓ ``git maintenance`` - Run tasks to optimize Git repository data * ✓ ``git prune`` - Prune all unreachable objects * ✓ ``git repack`` - Pack unpacked objects in a repository * ✓ ``git count-objects`` - Count unpacked number of objects Server Side ----------- * ✓ ``git daemon`` - A really simple server for Git repositories * ✓ ``git update-server-info`` - Update auxiliary info file * ✓ ``git upload-pack`` - Send objects packed back to git-fetch-pack * ✓ ``git receive-pack`` - Receive what is pushed into the repository Other ----- * ✓ ``git archive`` - Create archive of files from named tree * ✓ ``git bundle`` - Create, unpack, and manipulate bundle files * ✓ ``git stash`` - Stash changes in dirty working directory * ✓ ``git submodule`` - Initialize, update or inspect submodules * ✓ ``git notes`` - Add or inspect object notes * ✓ ``git replace`` - Create, list, delete refs to replace objects * ✓ ``git rerere`` - Reuse recorded resolution of conflicted merges * ✓ ``git help`` - Display help information * ◐ ``git fast-export`` - Export repository data (API only, see fastexport module) * ◐ ``git fast-import`` - Import repository data (API only, see fastexport module) * ✗ ``git gui`` - Portable graphical interface to Git * ✗ ``git web--browse`` - Launch web browser to view HTML documentation * ✗ ``git difftool`` - Show changes using external diff tool * ✗ ``git range-diff`` - Compare two commit ranges * ✗ ``git bugreport`` - Collect information for bug reports * ✓ ``git diagnose`` - Display diagnostic information about the environment * ✗ ``git fsmonitor--daemon`` - Filesystem monitor daemon * ✗ ``git scalar`` - Manage large Git repositories Plumbing Commands ================= Manipulation ------------ * ✗ ``git apply`` - Apply patch to files * ◐ ``git checkout-index`` - Copy files from index to working tree (API only) * ✓ ``git commit-tree`` - Create new commit object * ◐ ``git hash-object`` - Compute object ID (API only) * ◐ ``git index-pack`` - Build pack index file (API only) * ◐ ``git merge-file`` - Run three-way file merge (API only) * ✓ ``git merge-tree`` - Show three-way merge without touching index * ◐ ``git mktag`` - Create tag object (API only) * ✓ ``git pack-objects`` - Create packed archive of objects * ◐ ``git prune-packed`` - Remove extra objects (API only) * ◐ ``git read-tree`` - Read tree information into index (API only) * ✓ ``git symbolic-ref`` - Read, modify and delete symbolic refs * ✓ ``git unpack-objects`` - Unpack objects from packed archive * ◐ ``git update-index`` - Register file contents in working tree to index (API only) * ◐ ``git update-ref`` - Update object name stored in a ref (API only) * ✓ ``git write-tree`` - Create tree object from current index * ✗ ``git mktree`` - Build tree object from ls-tree formatted text Interrogation ------------- * ◐ ``git cat-file`` - Provide content or type and size information (API only) * ◐ ``git diff-files`` - Compare files in working tree and index (API only) * ◐ ``git diff-index`` - Compare content and mode of blobs (API only) * ✓ ``git diff-tree`` - Compare content and mode of trees * ✓ ``git for-each-ref`` - Output information on each ref * ✓ ``git ls-files`` - Show information about files in index and working tree * ✓ ``git ls-remote`` - List references in remote repository * ✓ ``git ls-tree`` - List contents of tree object * ✓ ``git merge-base`` - Find common ancestor * ◐ ``git name-rev`` - Find symbolic names for revisions (API 
only) * ✓ ``git pack-refs`` - Pack heads and tags for efficient repository access * ✓ ``git rev-list`` - List commit objects in reverse chronological order * ◐ ``git rev-parse`` - Pick out and massage parameters (API only, see objectspec module) * ◐ ``git show-index`` - Show packed archive index (API only) * ✓ ``git show-ref`` - List references in local repository * ✓ ``git var`` - Show Git logical variable * ◐ ``git verify-pack`` - Validate packed Git archive files (API only) Syncing ------- * ◐ ``git fetch-pack`` - Receive missing objects from another repository (CLI available) * ◐ ``git http-fetch`` - Download from remote Git repository via HTTP (API only) * ◐ ``git send-pack`` - Push objects over Git protocol to another repository (API only) * ✓ ``git update-server-info`` - Update auxiliary info for dumb servers * ✗ ``git http-push`` - Push objects over HTTP to another repository * ✗ ``git upload-archive`` - Send archive back to git-archive Pack Management --------------- * ◐ ``git multi-pack-index`` - Manage multi-pack-index (API only, see midx module) Internal Helpers ---------------- * ◐ ``git check-attr`` - Display gitattributes information (API only, see attrs module) * ✓ ``git check-ignore`` - Debug gitignore / exclude files * ✓ ``git check-mailmap`` - Show canonical names and email addresses * ✓ ``git column`` - Display data in columns * ◐ ``git credential`` - Retrieve and store user credentials (basic support) * ✗ ``git fmt-merge-msg`` - Produce merge commit message * ✓ ``git interpret-trailers`` - Add or parse structured information in commit messages * ✓ ``git mailinfo`` - Extract patch and authorship from single email message * ✓ ``git mailsplit`` - Simple UNIX mbox splitter * ✗ ``git merge-one-file`` - Standard helper program to use with git-merge-index * ◐ ``git patch-id`` - Compute unique ID for patch (API only, see patch module) * ✓ ``git stripspace`` - Remove unnecessary whitespace * ✗ ``git sh-setup`` - Common Git shell script setup code * ✗ ``git sh-i18n`` - Git's i18n setup code for shell scripts File Formats & Protocols ========================= Repository Format ----------------- * ✓ Object storage (loose objects) * ✓ Pack files (.pack) * ✓ Pack indexes (.idx) * ✓ Multi-pack index (.midx) * ✓ Pack bitmaps * ✓ Commit graphs * ✓ SHA-1 object format * ✓ SHA-256 object format * ✓ Reftable format Configuration Files ------------------- * ✓ .git/config * ✓ .gitignore * ✓ .gitattributes * ✓ .mailmap * ✓ .git/info/exclude * ✓ .git/info/attributes * ✓ .gitmodules Ref Storage ----------- * ✓ Loose refs (refs/heads/, refs/tags/, etc.) 
* ✓ Packed refs (packed-refs) * ✓ Reflog * ✓ Reftable Network Protocols ----------------- * ✓ SSH protocol * ✓ Git protocol (git://) * ✓ HTTP/HTTPS (smart protocol) * ✓ HTTP/HTTPS (dumb protocol) * ✓ File protocol (file://) * ✓ Local repositories * ◐ Protocol v2 (client fetch only, server limited) Transfer Capabilities --------------------- Fetch/Upload-Pack: * ✓ thin-pack - Server: ✓, Client: ✓ * ✓ ofs-delta - Server: ✓, Client: ✓ * ✓ multi_ack - Server: ✓, Client: ✓ * ✓ multi_ack_detailed - Server: ✓, Client: ✓ * ✓ side-band-64k - Server: ✓, Client: ✓ * ✓ shallow - Server: ✓, Client: ✓ * ✓ deepen-since - Server: ✓, Client: ✓ * ✓ deepen-not - Server: ✓, Client: ✓ * ✓ deepen-relative - Server: ✓, Client: ✓ * ✓ include-tag - Server: ✓, Client: ✓ * ◐ no-progress - Server: ✓, Client: ✗ * ✓ symref - Server: ✓, Client: ✓ * ◐ filter - Server: ✓, Client: ◐ (basic support) Push/Receive-Pack: * ✓ report-status - Server: ✓, Client: ✓ * ✓ delete-refs - Server: ✓, Client: ✓ * ✓ quiet - Server: ✓, Client: ✓ * ✓ atomic - Server: ✓, Client: ✓ * ✓ ofs-delta - Server: ✓, Client: ✓ * ✓ side-band-64k - Server: ✓, Client: ✓ General: * ✓ object-format - Server: ✓, Client: ✓ * ✓ agent - Server: ✓, Client: ✓ Advanced Features ================= Signatures ---------- * ✓ GPG commit signing * ✓ GPG tag signing * ✓ GPG signature verification (verify-commit, verify-tag) * ✓ SSH commit signing * ✓ SSH tag signing * ✓ SSH signature verification Filters & Attributes -------------------- * ✓ Clean/smudge filters * ✓ Text/binary detection * ✓ End-of-line conversion (CRLF/LF) * ✓ .gitattributes processing * ✗ Working tree encoding * ✓ Whitespace handling Hooks ----- * ✓ Hook execution * ✓ pre-commit * ✗ prepare-commit-msg * ✓ commit-msg * ✓ post-commit * ✗ pre-rebase * ✗ post-checkout * ✗ post-merge * ✗ pre-push * ✗ pre-receive * ✗ update * ✓ post-receive * ✗ post-update * ✗ push-to-checkout Git LFS ------- * ✓ git-lfs init * ✓ git-lfs track * ✓ git-lfs untrack * ✓ git-lfs ls-files * ✓ git-lfs fetch * ✓ git-lfs pull * ✓ git-lfs push * ✗ git-lfs checkout * ✓ git-lfs clean (filter) * ✓ git-lfs smudge (filter) * ◐ git-lfs pointer (API only via lfs_pointer_check) * ✓ git-lfs migrate * ✓ git-lfs status * ✓ LFS server implementation * ✓ LFS batch API Sparse Checkout --------------- * ✓ Sparse checkout patterns * ✓ Cone mode * ✓ git sparse-checkout init (cone mode) * ✓ git sparse-checkout set * ✓ git sparse-checkout add * ◐ git sparse-checkout list (via API only) * ◐ git sparse-checkout disable (via API only) * ✗ git sparse-checkout reapply Worktrees --------- * ✓ git worktree add * ✓ git worktree list * ✓ git worktree remove * ✓ git worktree prune * ✓ git worktree lock * ✓ git worktree unlock * ✓ git worktree move * ✓ git worktree repair Submodules ---------- * ✓ git submodule add * ✓ git submodule init * ✓ git submodule update * ◐ git submodule status (basic) * ✗ git submodule summary * ✗ git submodule foreach * ✗ git submodule sync * ✗ git submodule deinit * ✗ git submodule absorbgitdirs Notes ----- * ✓ git notes add * ✓ git notes list * ✓ git notes show * ✓ git notes remove * ✗ git notes append * ✗ git notes copy * ✗ git notes merge * ✗ git notes prune * ✗ git notes get-ref Other Advanced Features ----------------------- * ✓ Rerere (reuse recorded resolution) * ✓ Commit graph * ✓ Replace objects * ✓ Grafts * ✓ Info/alternates (alternate object databases) * ✓ Partial clone/fetch * ✓ Shallow clone/fetch * ✓ Bundle files * ✓ Fast-import/fast-export * ✗ Scalar * ◐ Partial clone with object filters (basic blob:none support) Web 
Interface ============= * ✓ Gitweb-like interface (dulwich.web) * ✓ WSGI application support * ✗ cgit * ✗ GitWeb (Perl implementation) Known Limitations ================= The following Git features are not currently supported: * Git GUIs (gitk, git-gui, git-citool) * Email workflow tools (git-send-email, git-request-pull) * Patch application (git-am, git-apply) * Interactive tools (git-difftool, git-mergetool) * Newer features (range-diff, scalar, fsmonitor--daemon) * Full protocol v2 server support (client is fully supported for fetch) * Some plumbing commands (mktree, http-push, upload-archive, fmt-merge-msg, merge-one-file) * Full submodule feature parity * Some advanced object filtering options * Most git hooks (only pre-commit, commit-msg, post-commit, post-receive) * Working tree encoding attribute Compatibility Notes =================== Repository Compatibility ------------------------ Dulwich maintains full wire-format and on-disk repository format compatibility with C Git. This means: * Dulwich can read and write repositories created by C Git * C Git can read and write repositories created by Dulwich * Dulwich and C Git can be used interchangeably on the same repository * Network protocols are fully compatible See Also ======== * :ref:`tutorial-index` - Tutorial for using Dulwich * :ref:`protocol` - Git protocol documentation * :mod:`dulwich.porcelain` - High-level API reference dulwich-1.0.0/docs/conf.py000066400000000000000000000152531513301442600154030ustar00rootroot00000000000000# # dulwich documentation build configuration file, created by # sphinx-quickstart on Thu Feb 18 23:18:28 2010. # # This file is execfile()d with the current directory set to its containing # dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import os import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath("..")) sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__)))) dulwich = __import__("dulwich") # -- General configuration ---------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.ifconfig", "sphinx.ext.intersphinx", "sphinx.ext.napoleon", ] autoclass_content = "both" # Add any paths that contain templates here, relative to this directory. templates_path = ["templates"] # The suffix of source filenames. source_suffix = ".txt" # The encoding of source files. # source_encoding = 'utf-8' # The master toctree document. master_doc = "index" # General information about the project. project = "dulwich" copyright = "2011-2023 Jelmer Vernooij" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = ".".join(map(str, dulwich.__version__[:2])) # The full version, including alpha/beta/rc tags. release = ".".join(map(str, dulwich.__version__)) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of documents that shouldn't be included in the build. # unused_docs = [] # List of directories, relative to source directory, that shouldn't be searched # for source files. exclude_trees = ["build"] # The reST default role (used for this markup: `text`) to use for all # documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. # html_theme = 'default' html_theme = "agogo" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. html_theme_path = ["theme"] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. # html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path: list[str] = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. # html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. # html_use_modindex = True # If false, no index is generated. # html_use_index = True # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 
# html_file_suffix = '' # Output file base name for HTML help builder. htmlhelp_basename = "dulwichdoc" # -- Options for LaTeX output ------------------------------------------------ # The paper size ('letter' or 'a4'). # latex_paper_size = 'letter' # The font size ('10pt', '11pt' or '12pt'). # latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ ("index", "dulwich.tex", "dulwich Documentation", "Jelmer Vernooij", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # Additional stuff for the LaTeX preamble. # latex_preamble = '' # Documents to append as an appendix to all manuals. # latex_appendices = [] # If false, no module index is generated. # latex_use_modindex = True # Add mappings intersphinx_mapping = { "urllib3": ("http://urllib3.readthedocs.org/en/latest", None), "python": ("http://docs.python.org/3", None), } dulwich-1.0.0/docs/index.txt000066400000000000000000000007221513301442600157470ustar00rootroot00000000000000.. _index: ====================================== dulwich - Python implementation of Git ====================================== Overview ======== .. include:: ../README.rst Documentation ============= .. toctree:: :maxdepth: 2 c-git-compatibility protocol tutorial/index recipes/index api/index Changelog ========= .. include:: ../NEWS Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` dulwich-1.0.0/docs/make.bat000066400000000000000000000063041513301442600155060ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation set SPHINXBUILD=sphinx-build set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. pdf to make PDF document echo. dirhtml to make HTML files named index.html in directories echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. changes to make an overview over all changed/added/deprecated items echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json echo. echo.Build finished; now you can process the JSON files. 
goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\dulwich.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\dulwich.ghc goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "pdf" ( %SPHINXBUILD% -b pdf %ALLSPHINXOPTS% %BUILDDIR%/pdf echo. echo.Build finished. The PDF files are in %BUILDDIR%/pdf. goto end ) :end dulwich-1.0.0/docs/protocol.txt000066400000000000000000000043441513301442600165050ustar00rootroot00000000000000.. _protocol: =================== Git Server Protocol =================== Transport ========= The Git protocol operates over pipes or TCP/IP. When a client connects over TCP/IP, it sends a header that tells the server which program to run and what parameters to use. When invoked over SSH, git will run a program with the parameters as command line arguments. Protocols ========= Basics ------ Git communicates with a server by piping data between a local program and a remote program. A common way of sending a unit of information is a pkt_line. This is a 4 byte size as human encoded hex (i.e. totally underusing the 4 bytes...) that tells you the size of the payload, followed by the payload. The size includes the 4 bytes used by the size itself. 0009ABCD\n Git can also multiplex data using the sideband. As well as 4 bytes size, there would be a 1 byte channel number. This is in binary, so ``1`` will be ``\x01``. Typically Git will piggyback a list of capabilities on the first pkt_line it sends. It will also look for capabilities in the first pkt_like it receives. Git will degrade as much as possible when encountering a server or client with differing capabilities. git-upload-pack --------------- git-upload pack is used by git-ls-remote, git-clone, git-fetch and git-pull. And i'm sure others. Typically a client will connect a local git-fetch-pack to a remote git-upload-pack. Capabilities for this protocol include multi_ack, thin-pack, ofs-delta, sideband and sideband-64k A thin pack can reference objects not in the current pack. The server tells the client what refs it has. The client states which of those SHA1's it would like. It then starts to report which SHA1's it has. The server ACKs these allowing the client to work out when to stop sending SHA1's. This saves a lot of transfer because the client can make decisions like "well if it has this SHA, then it has all its parents so I don't need to care about those". 
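To make the framing concrete, here is a minimal sketch in plain Python of how
such a request could be assembled out of pkt_lines. It deliberately does not
use Dulwich's own protocol helpers, and the SHA1 values and capability list
are placeholders::

    def pkt_line(payload: bytes) -> bytes:
        # Four hex digits give the total length, including the four size
        # bytes themselves, followed by the payload.
        return b"%04x" % (len(payload) + 4) + payload

    FLUSH_PKT = b"0000"  # a zero-length pkt ("flush-pkt") ends a section

    wanted = b"1" * 40   # placeholder: hex SHA1 the client wants
    have = b"2" * 40     # placeholder: hex SHA1 the client already has

    # Capabilities are piggybacked on the first pkt_line only.
    request = (
        pkt_line(b"want " + wanted + b" multi_ack side-band-64k ofs-delta\n")
        + FLUSH_PKT
        + pkt_line(b"have " + have + b"\n")
        + pkt_line(b"done\n")
    )

Checking the length prefix against the example given earlier: a five byte
payload such as ``ABCD\n`` becomes ``0009ABCD\n``.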
When the client stops sending shas, the server can work out an optimal pack and then send it to the client. git-receive-pack ---------------- git-receive-pack is used by git push. Typically a client connects a local git-send-pack to a remote git-receive-pack. Capabilities include report-status and delete-ref. dulwich-1.0.0/docs/recipes/000077500000000000000000000000001513301442600155305ustar00rootroot00000000000000dulwich-1.0.0/docs/recipes/index.txt000066400000000000000000000105041513301442600174000ustar00rootroot00000000000000.. _recipes: ======================== Recipes for common tasks ======================== How do I check out files with Dulwich? ====================================== The answer depends on the exact meaning of "check out" that is intended. There are several common goals it could be describing, and correspondingly several options to achieve them. Make sure a working tree on disk matches a particular commit (like ``git checkout``) ------------------------------------------------------------------------------------ :py:func:`dulwich.porcelain.checkout` is a very high-level function that operates on the working tree and behaves very similar to the ``git checkout`` command. It packages a lot of functionality into a single command, just as Git's porcelain does, which is useful when matching Git's CLI is the goal, but might be less desirable for programmatic access to a repository's contents. Retrieve a single file's contents at a particular commit -------------------------------------------------------- :py:func:`dulwich.object_store.tree_lookup_path` can a retrieve the object SHA given its path and the SHA of a tree to look it up in. This makes it very easy to access a specific file as stored in the repo. Note that this function operates on *trees*, not *commits* (every commit contains a tree for its contents, but a commit's ID is not the same as its tree's ID). With the retrieved SHA it's possible to get a file's blob directly from the repository's object store, and thus its content bytes. It's also possible to write it out to disk, using :py:func:`dulwich.index.build_file_from_blob`, which takes care of things like symlinks and file permissions. .. code-block:: python from dulwich.repo import Repo from dulwich.objectspec import parse_commit from dulwich.object_store import tree_lookup_path repo = Repo("/path/to/some/repo") # parse_commit will understand most commonly-used types of Git refs, including # short SHAs, tag names, branch names, HEAD, etc. commit = parse_commit(repo, "v1.0.0") path = b"README.md" mode, sha = tree_lookup_path(repo.get_object, commit.tree, path) # Normalizer takes care of line ending conversion and applying smudge # filters during checkout. See the Git Book for more details: # https://git-scm.com/book/ms/v2/Customizing-Git-Git-Attributes blob = repo.get_blob_normalizer().checkout_normalize(repo[sha], path) print(f"The readme at {commit.id.decode('ascii')} is:") print(blob.data.decode("utf-8")) Retrieve all or a subset of files at a particular commit -------------------------------------------------------- A dedicated helper function :py:func:`dulwich.object_store.iter_commit_contents` exists to simplify the common requirement of programmatically getting the contents of a repo as stored at a specific commit. Unlike :py:func:`!porcelain.checkout`, it is not tied to a working tree, or even files. 
When paired with :py:func:`dulwich.index.build_file_from_blob`, it's very easy to write out the retrieved files to an arbitrary location on disk, independent of any working trees. This makes it ideal for tasks such as retrieving a pristine copy of the contained files without any of Git's tracking information, for use in deployments, automation, and similar. .. code-block:: python import stat from pathlib import Path from dulwich.repo import Repo from dulwich.object_store import iter_commit_contents from dulwich.index import build_file_from_blob repo = Repo("/path/to/another/repo") normalize = repo.get_blob_normalizer().checkout_normalize commit = repo[repo.head()] encoding = commit.encoding or "utf-8" # Scan the repo at current HEAD. Retrieve all files marked as # executable under bin/ and write them to disk for entry in iter_commit_contents(repo, commit.id, include=[b"bin"]): if entry.mode & stat.S_IXUSR: # Strip the leading bin/ from returned paths, write to # current directory path = Path(entry.path.decode(encoding)).relative_to("bin/") # Make sure the target directory exists path.parent.mkdir(parents=True, exist_ok=True) blob = normalize(repo[entry.sha], entry.path) build_file_from_blob( blob, entry.mode, str(path) ) print(f"Wrote executable {path}") dulwich-1.0.0/docs/requirements.txt000066400000000000000000000000001513301442600173500ustar00rootroot00000000000000dulwich-1.0.0/docs/tutorial/000077500000000000000000000000001513301442600157415ustar00rootroot00000000000000dulwich-1.0.0/docs/tutorial/.gitignore000066400000000000000000000000161513301442600177260ustar00rootroot00000000000000*.html myrepo dulwich-1.0.0/docs/tutorial/Makefile000066400000000000000000000002321513301442600173760ustar00rootroot00000000000000RST2HTML = rst2html TXT=$(shell ls *.txt) ALL: index.html index.html: $(TXT) $(RST2HTML) index.txt index.html clean: rm -f index.html .PHONY: clean dulwich-1.0.0/docs/tutorial/conclusion.txt000066400000000000000000000005021513301442600206530ustar00rootroot00000000000000.. _tutorial-conclusion: Conclusion ========== This tutorial currently only covers a small (but important) part of Dulwich. It still needs to be extended to cover packs, refs, reflogs and network communication. Dulwich is abstracting much of the Git plumbing, so there would be more to see. For now, that's all folks! dulwich-1.0.0/docs/tutorial/encoding.txt000066400000000000000000000022141513301442600202670ustar00rootroot00000000000000Encoding ======== You will notice that all lower-level functions in Dulwich take byte strings rather than unicode strings. This is intentional. Although `C git`_ recommends the use of UTF-8 for encoding, this is not strictly enforced and C git treats filenames as sequences of non-NUL bytes. There are repositories in the wild that use non-UTF-8 encoding for filenames and commit messages. .. _C git: https://github.com/git/git/blob/master/Documentation/i18n.txt The library should be able to read *all* existing git repositories, regardless of what encoding they use. This is the main reason why Dulwich does not convert paths to unicode strings. A further consideration is that converting back and forth to unicode is an extra performance penalty. E.g. if you are just iterating over file contents, there is no need to consider encoded strings. Users of the library may have specific assumptions they can make about the encoding - e.g. they could just decide that all their data is latin-1, or the default Python encoding. 
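If an application does need text rather than bytes, it has to pick a decoding
policy itself. The sketch below is one possible policy, not something Dulwich
provides or prescribes: try UTF-8 first and fall back to latin-1, which maps
every byte value and therefore never fails::

    def decode_git_bytes(value: bytes, preferred: str = "utf-8") -> str:
        # Filenames and commit messages are plain bytes; decoding them to
        # text is the caller's decision, not Dulwich's.
        try:
            return value.decode(preferred)
        except UnicodeDecodeError:
            return value.decode("latin-1")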
Higher level functions, such as the porcelain in dulwich.porcelain, will automatically convert unicode strings to UTF-8 bytestrings. dulwich-1.0.0/docs/tutorial/file-format.txt000066400000000000000000000060321513301442600207100ustar00rootroot00000000000000Git File format =============== For a better understanding of Dulwich, we'll start by explaining most of the Git secrets. Open the ".git" folder of any Git-managed repository. You'll find folders like "branches", "hooks"... We're only interested in "objects" here. Open it. You'll mostly see 2 hex-digits folders. Git identifies content by its SHA-1 digest. The 2 hex-digits plus the 38 hex-digits of files inside these folders form the 40 characters (or 20 bytes) id of Git objects you'll manage in Dulwich. We'll first study the three main objects: - The Commit; - The Tree; - The Blob. The Commit ---------- You're used to generate commits using Git. You have set up your name and e-mail, and you know how to see the history using ``git log``. A commit file looks like this:: commit tree parent [parent if several parents from merges] author committer But where are the changes you committed? The commit contains a reference to a tree. The Tree -------- A tree is a collection of file information, the state of a single directory at a given point in time. A tree file looks like this:: tree ... And repeats for every file in the tree. Note that the SHA-1 digest is in binary form here. The file mode is like the octal argument you could give to the ``chmod`` command. Except it is in extended form to tell regular files from directories and other types. We now know how our files are referenced but we haven't found their actual content yet. That's where the reference to a blob comes in. The Blob -------- A blob is simply the content of files you are versioning. A blob file looks like this:: blob If you change a single line, another blob will be generated by Git each time you successfully run ``git add``. This is how Git can fastly checkout any version in time. On the opposite, several identical files with different filenames generate only one blob. That's mostly how renames are so cheap and efficient in Git. Dulwich Objects --------------- Dulwich implements these three objects with an API to easily access the information you need, while abstracting some more secrets Git is using to accelerate operations and reduce space. More About Git formats ---------------------- These three objects make up most of the contents of a Git repository and are used for the history. They can either appear as simple files on disk (one file per object) or in a ``pack`` file, which is a container for a number of these objects. There is also an index of the current state of the working copy in the repository as well as files to track the existing branches and tags. For a more detailed explanation of object formats and SHA-1 digests, see: http://www-cs-students.stanford.edu/~blynn/gitmagic/ch08.html Just note that recent versions of Git compress object files using zlib. dulwich-1.0.0/docs/tutorial/index.txt000066400000000000000000000002721513301442600176120ustar00rootroot00000000000000.. _tutorial: ======== Tutorial ======== .. toctree:: :maxdepth: 2 introduction encoding file-format repo object-store remote tag porcelain conclusion dulwich-1.0.0/docs/tutorial/introduction.txt000066400000000000000000000012571513301442600212300ustar00rootroot00000000000000.. 
_tutorial-introduction: Introduction ============ Like Git itself, Dulwich consists of two main layers; the so-called plumbing and the porcelain. The plumbing is the lower layer and it deals with the Git object database and the nitty gritty internals. The porcelain is roughly what you would expect to be exposed to as a user of the ``git`` command-like tool. Dulwich has a fairly complete plumbing implementation, and a more recently added porcelain implementation. The porcelain code lives in ``dulwich.porcelain``. For the large part, this tutorial introduces you to the internal concepts of Git and the main plumbing parts of Dulwich. The last chapter covers the porcelain. dulwich-1.0.0/docs/tutorial/object-store.txt000066400000000000000000000141421513301442600211040ustar00rootroot00000000000000.. _tutorial-object-store: The object store ================ The objects are stored in the ``object store`` of the repository. >>> from dulwich.repo import Repo >>> repo = Repo.init("myrepo", mkdir=True) Initial commit -------------- When you use Git, you generally add or modify content. As our repository is empty for now, we'll start by adding a new file:: >>> from dulwich.objects import Blob >>> blob = Blob.from_string(b"My file content\n") >>> print(blob.id.decode('ascii')) c55063a4d5d37aa1af2b2dad3a70aa34dae54dc6 Of course you could create a blob from an existing file using ``from_file`` instead. As said in the introduction, file content is separated from file name. Let's give this content a name:: >>> from dulwich.objects import Tree >>> tree = Tree() >>> tree.add(b"spam", 0o100644, blob.id) Note that "0o100644" is the octal form for a regular file with common permissions. You can hardcode them or you can use the ``stat`` module. The tree state of our repository still needs to be placed in time. That's the job of the commit:: >>> from dulwich.objects import Commit, parse_timezone >>> from time import time >>> commit = Commit() >>> commit.tree = tree.id >>> author = b"Your Name " >>> commit.author = commit.committer = author >>> commit.commit_time = commit.author_time = int(time()) >>> tz = parse_timezone(b'-0200')[0] >>> commit.commit_timezone = commit.author_timezone = tz >>> commit.encoding = b"UTF-8" >>> commit.message = b"Initial commit" Note that the initial commit has no parents. At this point, the repository is still empty because all operations happen in memory. Let's "commit" it. >>> object_store = repo.object_store >>> object_store.add_object(blob) Now the ".git/objects" folder contains a first SHA-1 file. Let's continue saving the changes:: >>> object_store.add_object(tree) >>> object_store.add_object(commit) Now the physical repository contains three objects but still has no branch. Let's create the master branch like Git would:: >>> repo.refs[b'refs/heads/master'] = commit.id The master branch now has a commit where to start. When we commit to master, we are also moving HEAD, which is Git's currently checked out branch: >>> head = repo.refs[b'HEAD'] >>> head == commit.id True >>> head == repo.refs[b'refs/heads/master'] True How did that work? As it turns out, HEAD is a special kind of ref called a symbolic ref, and it points at master. Most functions on the refs container work transparently with symbolic refs, but we can also take a peek inside HEAD: >>> import sys >>> print(repo.refs.read_ref(b'HEAD').decode(sys.getfilesystemencoding())) ref: refs/heads/master Normally, you won't need to use read_ref. 
If you want to change what ref HEAD points to, in order to check out another branch, just use set_symbolic_ref. Now our repository is officially tracking a branch named "master" referring to a single commit. Playing again with Git ---------------------- At this point you can come back to the shell, go into the "myrepo" folder and type ``git status`` to let Git confirm that this is a regular repository on branch "master". Git will tell you that the file "spam" is deleted, which is normal because Git is comparing the repository state with the current working copy. And we have absolutely no working copy using Dulwich because we don't need it at all! You can checkout the last state using ``git checkout -f``. The force flag will prevent Git from complaining that there are uncommitted changes in the working copy. The file ``spam`` appears and with no surprise contains the same bytes as the blob:: $ cat spam My file content Changing a File and Committing it --------------------------------- Now we have a first commit, the next one will show a difference. As seen in the introduction, it's about making a path in a tree point to a new blob. The old blob will remain to compute the diff. The tree is altered and the new commit'task is to point to this new version. Let's first build the blob:: >>> from dulwich.objects import Blob >>> spam = Blob.from_string(b"My new file content\n") >>> print(spam.id.decode('ascii')) 16ee2682887a962f854ebd25a61db16ef4efe49f An alternative is to alter the previously constructed blob object:: >>> blob.data = b"My new file content\n" >>> print(blob.id.decode('ascii')) 16ee2682887a962f854ebd25a61db16ef4efe49f In any case, update the blob id known as "spam". You also have the opportunity of changing its mode:: >>> tree[b"spam"] = (0o100644, spam.id) Now let's record the change:: >>> from dulwich.objects import Commit >>> from time import time >>> c2 = Commit() >>> c2.tree = tree.id >>> c2.parents = [commit.id] >>> c2.author = c2.committer = b"John Doe " >>> c2.commit_time = c2.author_time = int(time()) >>> c2.commit_timezone = c2.author_timezone = 0 >>> c2.encoding = b"UTF-8" >>> c2.message = b'Changing "spam"' In this new commit we record the changed tree id, and most important, the previous commit as the parent. Parents are actually a list because a commit may happen to have several parents after merging branches. Let's put the objects in the object store:: >>> repo.object_store.add_object(spam) >>> repo.object_store.add_object(tree) >>> repo.object_store.add_object(c2) You can already ask git to introspect this commit using ``git show`` and the value of ``c2.id`` as an argument. You'll see the difference will the previous blob recorded as "spam". The diff between the previous head and the new one can be printed using write_tree_diff:: >>> from dulwich.patch import write_tree_diff >>> from io import BytesIO >>> out = BytesIO() >>> write_tree_diff(out, repo.object_store, commit.tree, tree.id) >>> import sys; _ = sys.stdout.write(out.getvalue().decode('ascii')) diff --git a/spam b/spam index c55063a..16ee268 100644 --- a/spam +++ b/spam @@ -1 +1 @@ -My file content +My new file content You won't see it using git log because the head is still the previous commit. It's easy to remedy:: >>> repo.refs[b'refs/heads/master'] = c2.id Now all git tools will work as expected. 
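As a quick sanity check from within Dulwich itself (reusing the objects
created earlier in this chapter), both HEAD and the "master" branch now
resolve to the new commit, whose parent is the initial commit::

   >>> repo.refs[b'HEAD'] == c2.id
   True
   >>> repo[repo.head()].parents == [commit.id]
   True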
dulwich-1.0.0/docs/tutorial/porcelain.txt000066400000000000000000000026151513301442600204620ustar00rootroot00000000000000Porcelain ========= The ``porcelain`` is the higher level interface, built on top of the lower level implementation covered in previous chapters of this tutorial. The ``dulwich.porcelain`` module in Dulwich is aimed to closely resemble the Git command-line API that you are familiar with. Basic concepts -------------- The porcelain operations are implemented as top-level functions in the ``dulwich.porcelain`` module. Most arguments can either be strings or more complex Dulwich objects; e.g. a repository argument will either take a string with a path to the repository or an instance of a ``Repo`` object. Initializing a new repository ----------------------------- >>> from dulwich import porcelain >>> repo = porcelain.init("myrepo") Clone a repository ------------------ >>> porcelain.clone("git://github.com/jelmer/dulwich", "dulwich-clone") Basic authentication works using the ``username`` and ``password`` parameters: >>> porcelain.clone( "https://example.com/a-private-repo.git", "a-private-repo-clone", username="user", password="password") Commit changes -------------- >>> r = porcelain.init("testrepo") >>> open("testrepo/testfile", "w").write("data") >>> porcelain.add(r, "testfile") >>> porcelain.commit(r, b"A sample commit") Push changes ------------ >>> tr = porcelain.init("targetrepo") >>> r = porcelain.push("testrepo", "targetrepo", "master") dulwich-1.0.0/docs/tutorial/remote.txt000066400000000000000000000064771513301442600200130ustar00rootroot00000000000000.. _tutorial-remote: Most of the tests in this file require a Dulwich server, so let's start one: >>> from dulwich.repo import Repo >>> from dulwich.server import DictBackend, TCPGitServer >>> import threading >>> repo = Repo.init("remote", mkdir=True) >>> cid = repo.get_worktree().commit(b"message", committer=b"Jelmer ") >>> backend = DictBackend({b'/': repo}) >>> dul_server = TCPGitServer(backend, b'localhost', 0) >>> server_thread = threading.Thread(target=dul_server.serve) >>> server_thread.start() >>> server_address, server_port=dul_server.socket.getsockname() Remote repositories =================== The interface for remote Git repositories is different from that for local repositories. The Git smart server protocol provides three basic operations: * upload-pack - provides a pack with objects requested by the client * receive-pack - imports a pack with objects provided by the client * upload-archive - provides a tarball with the contents of a specific revision The smart server protocol can be accessed over either plain TCP (git://), SSH (git+ssh://) or tunneled over HTTP (http://). Dulwich provides support for accessing remote repositories in ``dulwich.client``. To create a new client, you can construct one manually:: >>> from dulwich.client import TCPGitClient >>> client = TCPGitClient(server_address, server_port) Retrieving raw pack files ------------------------- The client object can then be used to retrieve a pack. The ``fetch_pack`` method takes a ``determine_wants`` callback argument, which allows the client to determine which objects it wants to end up with:: >>> def determine_wants(refs, depth=None): ... # retrieve all objects ... return refs.values() Note that the ``depth`` keyword argument will contain an optional requested shallow fetch depth. Another required object is a "graph walker", which is used to determine which objects that the client already has should not be sent again by the server. 
Here in the tutorial we'll just use a dummy graph walker which claims that the client doesn't have any objects:: >>> class DummyGraphWalker(object): ... def __init__(self): ... self.shallow = set() ... def ack(self, sha): pass ... def nak(self): pass ... def next(self): pass ... def __next__(self): pass With the ``determine_wants`` function in place, we can now fetch a pack, which we will write to a ``BytesIO`` object:: >>> from io import BytesIO >>> f = BytesIO() >>> result = client.fetch_pack(b"/", determine_wants, ... DummyGraphWalker(), pack_data=f.write) ``f`` will now contain a full pack file:: >>> print(f.getvalue()[:4].decode('ascii')) PACK Fetching objects into a local repository ---------------------------------------- It is also possible to fetch from a remote repository into a local repository, in which case Dulwich takes care of providing the right graph walker, and importing the received pack file into the local repository:: >>> from dulwich.repo import Repo >>> local = Repo.init("local", mkdir=True) >>> remote_refs = client.fetch(b"/", local) >>> local.close() Let's shut down the server now that all tests have been run:: >>> client.close() >>> dul_server.shutdown() >>> dul_server.server_close() >>> repo.close() dulwich-1.0.0/docs/tutorial/repo.txt000066400000000000000000000057451513301442600174620ustar00rootroot00000000000000.. _tutorial-repo: The repository ============== After this introduction, let's start directly with code:: >>> from dulwich.repo import Repo The access to a repository is through the Repo object. You can open an existing repository or you can create a new one. There are two types of Git repositories: Regular Repositories -- They are the ones you create using ``git init`` and you daily use. They contain a ``.git`` folder. Bare Repositories -- There is no ".git" folder. The top-level folder contains itself the "branches", "hooks"... folders. These are used for published repositories (mirrors). They do not have a working tree. Creating a repository --------------------- Let's create a folder and turn it into a repository, like ``git init`` would:: >>> from os import mkdir >>> import sys >>> mkdir("myrepo") >>> repo = Repo.init("myrepo") >>> repo You can already look at the structure of the "myrepo/.git" folder, though it is mostly empty for now. Opening an existing repository ------------------------------ To reopen an existing repository, simply pass its path to the constructor of ``Repo``:: >>> repo = Repo("myrepo") >>> repo Opening the index ----------------- The index is used as a staging area. Once you do a commit, the files tracked in the index will be recorded as the contents of the new commit. As mentioned earlier, only non-bare repositories have a working tree, so only non-bare repositories will have an index, too. To open the index, simply call:: >>> index = repo.open_index() >>> print(index.path) myrepo/.git/index Since the repository was just created, the index will be empty:: >>> list(index) [] Staging new files ----------------- The repository allows "staging" files. Only files can be staged - directories aren't tracked explicitly by git. Let's create a simple text file and stage it:: >>> f = open('myrepo/foo', 'wb') >>> _ = f.write(b"monty") >>> f.close() >>> repo.get_worktree().stage([b"foo"]) It will now show up in the index:: >>> print(",".join([f.decode(sys.getfilesystemencoding()) for f in repo.open_index()])) foo Creating new commits -------------------- Now that we have staged a change, we can commit it. 
The easiest way to do this is by using ``WorkTree.commit``. It is also possible to manipulate the lower-level objects involved in this, but we'll leave that for a separate chapter of the tutorial. To create a simple commit on the current branch, it is only necessary to specify the message. The committer and author will be retrieved from the repository configuration or global configuration if they are not specified:: >>> commit_id = repo.get_worktree().commit( ... b"The first commit", committer=b"Jelmer Vernooij ") ``commit`` returns the SHA1 of the commit. Since the commit was to the default branch, the repository's head will now be set to that commit:: >>> repo.head() == commit_id True dulwich-1.0.0/docs/tutorial/tag.txt000066400000000000000000000033421513301442600172570ustar00rootroot00000000000000.. _tutorial-tag: Tagging ======= This tutorial will demonstrate how to add a tag to a commit via dulwich. First let's initialize the repository: >>> from dulwich.repo import Repo >>> _repo = Repo("myrepo", mkdir=True) Next we build the commit object and add it to the object store: >>> from dulwich.objects import Blob, Tree, Commit, parse_timezone >>> permissions = 0100644 >>> author = "John Smith" >>> blob = Blob.from_string("empty") >>> tree = Tree() >>> tree.add(tag, permissions, blob.id) >>> commit = Commit() >>> commit.tree = tree.id >>> commit.author = commit.committer = author >>> commit.commit_time = commit.author_time = int(time()) >>> tz = parse_timezone('-0200')[0] >>> commit.commit_timezone = commit.author_timezone = tz >>> commit.encoding = "UTF-8" >>> commit.message = 'Tagging repo: ' + message Add objects to the repo store instance: >>> object_store = _repo.object_store >>> object_store.add_object(blob) >>> object_store.add_object(tree) >>> object_store.add_object(commit) >>> master_branch = 'master' >>> _repo.refs['refs/heads/' + master_branch] = commit.id Finally, add the tag top the repo: >>> _repo['refs/tags/' + commit] = commit.id Alternatively, we can use the tag object if we'd like to annotate the tag: >>> from dulwich.objects import Blob, Tree, Commit, parse_timezone, Tag >>> tag_message = "Tag Annotation" >>> tag = Tag() >>> tag.tagger = author >>> tag.message = message >>> tag.name = "v0.1" >>> tag.object = (Commit, commit.id) >>> tag.tag_time = commit.author_time >>> tag.tag_timezone = tz >>> object_store.add_object(tag) >>> _repo['refs/tags/' + tag] = tag.id dulwich-1.0.0/dulwich.cfg000066400000000000000000000001731513301442600152670ustar00rootroot00000000000000packages: dulwich docformat: restructuredtext projectname: Dulwich projecturl: https://www.dulwich.io/ htmloutput: apidocs dulwich-1.0.0/dulwich/000077500000000000000000000000001513301442600146055ustar00rootroot00000000000000dulwich-1.0.0/dulwich/__init__.py000066400000000000000000000070161513301442600167220ustar00rootroot00000000000000# __init__.py -- The git module of dulwich # Copyright (C) 2007 James Westby # Copyright (C) 2008 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Python implementation of the Git file formats and protocols.""" from collections.abc import Callable from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar __version__ = (1, 0, 0) __all__ = ["__version__", "replace_me"] P = ParamSpec("P") R = TypeVar("R") F = TypeVar("F", bound=Callable[..., Any]) if TYPE_CHECKING: # For type checking, always use our typed signature def replace_me( since: tuple[int, ...] | str | None = None, remove_in: tuple[int, ...] | str | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Decorator to mark functions as deprecated.""" ... else: try: from dissolve import replace_me as replace_me except ImportError: # if dissolve is not installed, then just provide a basic implementation # of its replace_me decorator def replace_me( since: tuple[int, ...] | str | None = None, remove_in: tuple[int, ...] | str | None = None, ) -> Callable[[Callable[P, R]], Callable[P, R]]: """Decorator to mark functions as deprecated. Args: since: Version when the function was deprecated remove_in: Version when the function will be removed Returns: Decorator function """ def decorator(func: Callable[P, R]) -> Callable[P, R]: import functools import warnings m = f"{func.__name__} is deprecated" since_str = str(since) if since is not None else None remove_in_str = str(remove_in) if remove_in is not None else None if since_str is not None and remove_in_str is not None: m += f" since {since_str} and will be removed in {remove_in_str}" elif since_str is not None: m += f" since {since_str}" elif remove_in_str is not None: m += f" and will be removed in {remove_in_str}" else: m += " and will be removed in a future version" @functools.wraps(func) def _wrapped_func(*args: P.args, **kwargs: P.kwargs) -> R: warnings.warn( m, DeprecationWarning, stacklevel=2, ) return func(*args, **kwargs) return _wrapped_func return decorator dulwich-1.0.0/dulwich/__main__.py000066400000000000000000000024701513301442600167020ustar00rootroot00000000000000# __main__.py -- Entry point for running dulwich as a module # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Entry point for running dulwich as a module. This module allows dulwich to be run as a Python module using the -m flag: python -m dulwich It serves as the main entry point for the dulwich command-line interface. """ __all__ = [] from . 
import cli if __name__ == "__main__": cli._main() dulwich-1.0.0/dulwich/_typing.py000066400000000000000000000023101513301442600166240ustar00rootroot00000000000000# _typing.py -- Common type definitions for Dulwich # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Common type definitions for Dulwich.""" import sys if sys.version_info >= (3, 12): from collections.abc import Buffer else: Buffer = bytes | bytearray | memoryview __all__ = ["Buffer"] dulwich-1.0.0/dulwich/aiohttp/000077500000000000000000000000001513301442600162555ustar00rootroot00000000000000dulwich-1.0.0/dulwich/aiohttp/__init__.py000066400000000000000000000020211513301442600203610ustar00rootroot00000000000000# __init__.py -- aiohttp support # Copyright (C) 2022 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """aiohttp support for Dulwich.""" dulwich-1.0.0/dulwich/aiohttp/server.py000066400000000000000000000271161513301442600201440ustar00rootroot00000000000000# aiohttp.py -- aiohttp smart client/server # Copyright (C) 2022 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# """aiohttp client/server support.""" import asyncio import sys from io import BytesIO from typing import BinaryIO, cast from aiohttp import web from .. import log_utils from ..errors import HangupException from ..objects import ObjectID from ..protocol import ReceivableProtocol from ..repo import Repo from ..server import ( DEFAULT_HANDLERS, BackendRepo, DictBackend, generate_info_refs, generate_objects_info_packs, ) from ..web import NO_CACHE_HEADERS, cache_forever_headers logger = log_utils.getLogger(__name__) # Application keys for type-safe access to app state REPO_KEY = web.AppKey("repo", Repo) HANDLERS_KEY = web.AppKey("handlers", dict) DUMB_KEY = web.AppKey("dumb", bool) async def send_file( req: web.Request, f: BinaryIO | None, headers: dict[str, str] ) -> web.StreamResponse: """Send a file-like object to the request output. Args: req: The HTTPGitRequest object to send output to. f: An open file-like object to send; will be closed. headers: Headers to send Returns: Iterator over the contents of the file, as chunks. """ if f is None: raise web.HTTPNotFound(text="File not found") response = web.StreamResponse(status=200, reason="OK", headers=headers) await response.prepare(req) try: while True: data = f.read(10240) if not data: break await response.write(data) except OSError: raise web.HTTPInternalServerError(text="Error reading file") finally: f.close() await response.write_eof() return response async def get_loose_object(request: web.Request) -> web.Response: """Handle request for a loose object. Args: request: aiohttp request object Returns: Response with the loose object data """ sha = ObjectID( (request.match_info["dir"] + request.match_info["file"]).encode("ascii") ) logger.info("Sending loose object %s", sha) object_store = request.app[REPO_KEY].object_store if not object_store.contains_loose(sha): raise web.HTTPNotFound(text="Object not found") try: data = object_store[sha].as_legacy_object() except OSError: raise web.HTTPInternalServerError(text="Error reading object") headers = {"Content-Type": "application/x-git-loose-object"} headers.update(cache_forever_headers()) return web.Response(status=200, headers=headers, body=data) async def get_text_file(request: web.Request) -> web.StreamResponse: """Handle request for a text file. Args: request: aiohttp request object Returns: Response with the text file contents """ headers = {"Content-Type": "text/plain"} headers.update(NO_CACHE_HEADERS) path = request.match_info["file"] logger.info("Sending plain text file %s", path) repo = request.app[REPO_KEY] return await send_file(request, repo.get_named_file(path), headers) async def refs_request( repo: Repo, request: web.Request, handlers: dict[bytes, type] | None = None ) -> web.StreamResponse | web.Response: """Handle a refs request. 
Args: repo: Repository object request: aiohttp request object handlers: Optional dict of service handlers Returns: Response with refs information """ service = request.query.get("service") if service: if handlers is None: handlers = dict(DEFAULT_HANDLERS) handler_cls = handlers.get(service.encode("ascii"), None) if handler_cls is None: raise web.HTTPForbidden(text="Unsupported service") headers = {"Content-Type": f"application/x-{service}-advertisement"} headers.update(NO_CACHE_HEADERS) response = web.StreamResponse(status=200, headers=headers) await response.prepare(request) out = BytesIO() proto = ReceivableProtocol(BytesIO().read, out.write) handler = handler_cls( DictBackend({b".": cast(BackendRepo, repo)}), [b"."], proto, stateless_rpc=True, advertise_refs=True, ) handler.proto.write_pkt_line(b"# service=" + service.encode("ascii") + b"\n") handler.proto.write_pkt_line(None) # TODO(jelmer): Implement this with proper async code await asyncio.to_thread(handler.handle) await response.write(out.getvalue()) await response.write_eof() return response else: # non-smart fallback headers = {"Content-Type": "text/plain"} headers.update(NO_CACHE_HEADERS) logger.info("Emulating dumb info/refs") return web.Response(body=b"".join(generate_info_refs(repo)), headers=headers) async def get_info_refs(request: web.Request) -> web.StreamResponse | web.Response: """Handle request for /info/refs. Args: request: aiohttp request object Returns: Response with refs information """ repo = request.app[REPO_KEY] return await refs_request(repo, request, request.app[HANDLERS_KEY]) async def get_info_packs(request: web.Request) -> web.Response: """Handle request for /info/packs. Args: request: aiohttp request object Returns: Response with pack information """ headers = {"Content-Type": "text/plain"} headers.update(NO_CACHE_HEADERS) logger.info("Emulating dumb info/packs") return web.Response( body=b"".join(generate_objects_info_packs(request.app[REPO_KEY])), headers=headers, ) async def get_pack_file(request: web.Request) -> web.StreamResponse: """Handle request for a pack file. Args: request: aiohttp request object Returns: Response with the pack file data """ headers = {"Content-Type": "application/x-git-packed-objects"} headers.update(cache_forever_headers()) sha = request.match_info["sha"] path = f"objects/pack/pack-{sha}.pack" logger.info("Sending pack file %s", path) return await send_file( request, request.app[REPO_KEY].get_named_file(path), headers=headers, ) async def get_index_file(request: web.Request) -> web.StreamResponse: """Handle request for a pack index file. Args: request: aiohttp request object Returns: Response with the pack index file data """ headers = {"Content-Type": "application/x-git-packed-objects-toc"} headers.update(cache_forever_headers()) sha = request.match_info["sha"] path = f"objects/pack/pack-{sha}.idx" logger.info("Sending pack file %s", path) return await send_file( request, request.app["repo"].get_named_file(path), headers=headers ) async def service_request( repo: Repo, request: web.Request, handlers: dict[bytes, type] | None = None ) -> web.StreamResponse: """Handle a git service request (upload-pack or receive-pack). 
Args: repo: Repository object request: aiohttp request object handlers: Optional dict of service handlers Returns: Response with service result """ service = request.match_info["service"] if handlers is None: handlers = dict(DEFAULT_HANDLERS) logger.info("Handling service request for %s", service) handler_cls = handlers.get(service.encode("ascii"), None) if handler_cls is None: raise web.HTTPForbidden(text="Unsupported service") headers = {"Content-Type": f"application/x-{service}-result"} headers.update(NO_CACHE_HEADERS) response = web.StreamResponse(status=200, headers=headers) await response.prepare(request) inf = BytesIO(await request.read()) outf = BytesIO() def handle() -> None: proto = ReceivableProtocol(inf.read, outf.write) handler = handler_cls( DictBackend({b".": cast(BackendRepo, repo)}), [b"."], proto, stateless_rpc=True, ) try: handler.handle() except HangupException: response.force_close() # TODO(jelmer): Implement this with proper async code await asyncio.to_thread(handle) await response.write(outf.getvalue()) await response.write_eof() return response async def handle_service_request(request: web.Request) -> web.StreamResponse: """Handle a service request endpoint. Args: request: aiohttp request object Returns: Response with service result """ repo = request.app[REPO_KEY] return await service_request(repo, request, request.app[HANDLERS_KEY]) def create_repo_app( repo: Repo, handlers: dict[bytes, type] | None = None, dumb: bool = False ) -> web.Application: """Create an aiohttp application for serving a git repository. Args: repo: Repository object to serve handlers: Optional dict of service handlers dumb: Whether to enable dumb HTTP protocol support Returns: Configured aiohttp Application """ app = web.Application() app[REPO_KEY] = repo if handlers is None: handlers = dict(DEFAULT_HANDLERS) app[HANDLERS_KEY] = handlers app[DUMB_KEY] = dumb app.router.add_get("/info/refs", get_info_refs) app.router.add_post( "/{service:git-upload-pack|git-receive-pack}", handle_service_request ) if dumb: app.router.add_get("/{file:HEAD}", get_text_file) app.router.add_get("/{file:objects/info/alternates}", get_text_file) app.router.add_get("/{file:objects/info/http-alternates}", get_text_file) app.router.add_get("/objects/info/packs", get_info_packs) app.router.add_get( "/objects/{dir:[0-9a-f]{2}}/{file:[0-9a-f]{38}}", get_loose_object ) app.router.add_get( "/objects/pack/pack-{sha:[0-9a-f]{40}}\\.pack", get_pack_file ) app.router.add_get( "/objects/pack/pack-{sha:[0-9a-f]{40}}\\.idx", get_index_file ) return app def main(argv: list[str] | None = None) -> None: """Entry point for starting an HTTP git server.""" import argparse parser = argparse.ArgumentParser() parser.add_argument( "-l", "--listen_address", dest="listen_address", default="localhost", help="Binding IP address.", ) parser.add_argument( "-p", "--port", dest="port", type=int, default=8000, help="Port to listen on.", ) parser.add_argument("gitdir", type=str, default=".", nargs="?") args = parser.parse_args(argv) log_utils.default_logging_config() app = create_repo_app(Repo(args.gitdir)) logger.info( "Listening for HTTP connections on %s:%d", args.listen_address, args.port, ) web.run_app(app, port=args.port, host=args.listen_address) if __name__ == "__main__": main(sys.argv[1:]) dulwich-1.0.0/dulwich/annotate.py000066400000000000000000000113241513301442600167710ustar00rootroot00000000000000# annotate.py -- Annotate files with last changed revision # Copyright (C) 2015 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR 
GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Annotate file contents indicating when they were last changed. Annotated lines are represented as tuples with last modified revision SHA1 and contents. Please note that this is a very naive annotate implementation. It works, but its speed could be improved - in particular because it uses Python's difflib. """ __all__ = ["annotate_lines", "update_lines"] import difflib from collections.abc import Sequence from typing import TYPE_CHECKING from dulwich.objects import Blob from dulwich.walk import ( ORDER_DATE, Walker, ) if TYPE_CHECKING: from dulwich.diff_tree import TreeChange from dulwich.object_store import BaseObjectStore from dulwich.objects import Commit, ObjectID, TreeEntry # Walk over ancestry graph breadth-first # When checking each revision, find lines that according to difflib.Differ() # are common between versions. # Any lines that are not in common were introduced by the newer revision. # If there were no lines kept from the older version, stop going deeper in the # graph. def update_lines( annotated_lines: Sequence[tuple[tuple["Commit", "TreeEntry"], bytes]], new_history_data: tuple["Commit", "TreeEntry"], new_blob: "Blob", ) -> list[tuple[tuple["Commit", "TreeEntry"], bytes]]: """Update annotation lines with old blob lines.""" ret: list[tuple[tuple[Commit, TreeEntry], bytes]] = [] new_lines = new_blob.splitlines() matcher = difflib.SequenceMatcher( a=[line for (h, line) in annotated_lines], b=new_lines ) for tag, i1, i2, j1, j2 in matcher.get_opcodes(): if tag == "equal": ret.extend(annotated_lines[i1:i2]) elif tag in ("insert", "replace"): ret.extend([(new_history_data, line) for line in new_lines[j1:j2]]) elif tag == "delete": pass # don't care else: raise RuntimeError(f"Unknown tag {tag} returned in diff") return ret def annotate_lines( store: "BaseObjectStore", commit_id: "ObjectID", path: bytes, order: str = ORDER_DATE, lines: Sequence[tuple[tuple["Commit", "TreeEntry"], bytes]] | None = None, follow: bool = True, ) -> list[tuple[tuple["Commit", "TreeEntry"], bytes]]: """Annotate the lines of a blob. 
:param store: Object store to retrieve objects from :param commit_id: Commit id in which to annotate path :param path: Path to annotate :param order: Order in which to process history (defaults to ORDER_DATE) :param lines: Initial lines to compare to (defaults to specified) :param follow: Whether to follow changes across renames/copies :return: List of (commit, line) entries where commit is the oldest commit that changed a line """ walker = Walker( store, include=[commit_id], paths=[path], order=order, follow=follow ) revs: list[tuple[Commit, TreeEntry]] = [] for log_entry in walker: for tree_change in log_entry.changes(): changes: list[TreeChange] if isinstance(tree_change, list): changes = tree_change else: changes = [tree_change] for change in changes: if change.new is not None and change.new.path == path: if change.old is not None and change.old.path is not None: path = change.old.path revs.append((log_entry.commit, change.new)) break lines_annotated: list[tuple[tuple[Commit, TreeEntry], bytes]] = [] for commit, entry in reversed(revs): assert entry.sha is not None blob_obj = store[entry.sha] assert isinstance(blob_obj, Blob) lines_annotated = update_lines(lines_annotated, (commit, entry), blob_obj) return lines_annotated dulwich-1.0.0/dulwich/approxidate.py000066400000000000000000000121161513301442600175000ustar00rootroot00000000000000# approxidate.py -- Parsing of Git's "approxidate" time specifications # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Parsing of Git's "approxidate" time specifications. Git uses a flexible date parser called "approxidate" that accepts various formats for specifying dates and times, including: - Relative times: "yesterday", "2 days ago", "2.weeks.ago" - Absolute dates: "2005-04-07", "2005-04-07 22:13:13" - Unix timestamps: "1234567890" - Special keywords: "now", "today", "yesterday" """ __all__ = ["parse_approxidate", "parse_relative_time"] import time from datetime import datetime def parse_approxidate(time_spec: str | bytes) -> int: """Parse a Git approxidate specification and return a Unix timestamp. Args: time_spec: Time specification string. 
Can be: - A Unix timestamp (integer as string) - A relative time like "2 weeks ago" or "2.weeks.ago" - Special keywords: "now", "today", "yesterday" - Absolute date: "2005-04-07" or "2005-04-07 22:13:13" Returns: Unix timestamp (seconds since epoch) Raises: ValueError: If the time specification cannot be parsed """ if isinstance(time_spec, bytes): time_spec = time_spec.decode("utf-8") time_spec = time_spec.strip() # Get current time now = time.time() # Handle special keywords if time_spec == "yesterday": return int(now - 86400) elif time_spec == "today": # Start of today (midnight) dt = datetime.fromtimestamp(now) dt = dt.replace(hour=0, minute=0, second=0, microsecond=0) return int(dt.timestamp()) elif time_spec == "now": return int(now) # Try parsing as Unix timestamp try: return int(time_spec) except ValueError: pass # Handle relative time specifications # Supports both "2 weeks ago" and "2.weeks.ago" formats if " ago" in time_spec or ".ago" in time_spec: seconds_ago = parse_relative_time(time_spec) return int(now - seconds_ago) # Try parsing as absolute timestamp formats # Git supports various formats like: # - "2005-04-07" (ISO date) # - "2005-04-07 22:13:13" (ISO datetime) # - "2005-04-07T22:13:13" (ISO 8601) formats = [ "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d", "%Y/%m/%d %H:%M:%S", "%Y/%m/%d", ] for fmt in formats: try: dt = datetime.strptime(time_spec, fmt) return int(dt.timestamp()) except ValueError: continue raise ValueError(f"Unable to parse time specification: {time_spec!r}") def parse_relative_time(time_str: str) -> int: """Parse a relative time string like '2 weeks ago' into seconds. Args: time_str: String like '2 weeks ago', '2.weeks.ago', or 'now' Returns: Number of seconds (relative to current time) Raises: ValueError: If the time string cannot be parsed """ if time_str == "now": return 0 # Normalize dot-separated format to space-separated # "2.weeks.ago" -> "2 weeks ago" normalized = time_str.replace(".ago", " ago").replace(".", " ") if not normalized.endswith(" ago"): raise ValueError(f"Invalid relative time format: {time_str}") parts = normalized[:-4].split() if len(parts) != 2: raise ValueError(f"Invalid relative time format: {time_str}") try: num = int(parts[0]) unit = parts[1] multipliers = { "second": 1, "seconds": 1, "minute": 60, "minutes": 60, "hour": 3600, "hours": 3600, "day": 86400, "days": 86400, "week": 604800, "weeks": 604800, "month": 2592000, # 30 days "months": 2592000, "year": 31536000, # 365 days "years": 31536000, } if unit in multipliers: return num * multipliers[unit] else: raise ValueError(f"Unknown time unit: {unit}") except ValueError as e: if "invalid literal" in str(e): raise ValueError(f"Invalid number in relative time: {parts[0]}") raise dulwich-1.0.0/dulwich/archive.py000066400000000000000000000145071513301442600166070ustar00rootroot00000000000000# archive.py -- Creating an archive from a tarball # Copyright (C) 2015 Jonas Haag # Copyright (C) 2015 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Generates tarballs for Git trees.""" __all__ = ["ChunkedBytesIO", "tar_stream"] import posixpath import stat import struct import tarfile from collections.abc import Generator from contextlib import closing from io import BytesIO from os import SEEK_END from typing import TYPE_CHECKING if TYPE_CHECKING: from .object_store import BaseObjectStore from .objects import TreeEntry from .objects import Tree class ChunkedBytesIO: """Turn a list of bytestrings into a file-like object. This is similar to creating a `BytesIO` from a concatenation of the bytestring list, but saves memory by NOT creating one giant bytestring first:: BytesIO(b''.join(list_of_bytestrings)) =~= ChunkedBytesIO( list_of_bytestrings) """ def __init__(self, contents: list[bytes]) -> None: """Initialize ChunkedBytesIO. Args: contents: List of byte chunks """ self.contents = contents self.pos = (0, 0) def read(self, maxbytes: int | None = None) -> bytes: """Read bytes from the chunked stream. Args: maxbytes: Maximum number of bytes to read (None for all) Returns: Bytes read """ if maxbytes is None or maxbytes < 0: remaining = None else: remaining = maxbytes buf = [] chunk, cursor = self.pos while chunk < len(self.contents): chunk_remainder = len(self.contents[chunk]) - cursor if remaining is not None and remaining < chunk_remainder: buf.append(self.contents[chunk][cursor : cursor + remaining]) cursor += remaining self.pos = (chunk, cursor) break else: buf.append(self.contents[chunk][cursor:]) if remaining is not None: remaining -= chunk_remainder chunk += 1 cursor = 0 self.pos = (chunk, cursor) return b"".join(buf) def tar_stream( store: "BaseObjectStore", tree: "Tree", mtime: int, prefix: bytes = b"", format: str = "", ) -> Generator[bytes, None, None]: """Generate a tar stream for the contents of a Git tree. Returns a generator that lazily assembles a .tar.gz archive, yielding it in pieces (bytestrings). To obtain the complete .tar.gz binary file, simply concatenate these chunks. Args: store: Object store to retrieve objects from tree: Tree object for the tree root mtime: UNIX timestamp that is assigned as the modification time for all files, and the gzip header modification time if format='gz' prefix: Optional prefix to prepend to all paths in the archive format: Optional compression format for tarball Returns: Bytestrings """ buf = BytesIO() mode = "w:" + format if format else "w" from typing import Any, cast # The tarfile.open overloads are complex; cast to Any to avoid issues with closing(cast(Any, tarfile.open)(name=None, mode=mode, fileobj=buf)) as tar: if format == "gz": # Manually correct the gzip header file modification time so that # archives created from the same Git tree are always identical. 
# The gzip header file modification time is not currently # accessible from the tarfile API, see: # https://bugs.python.org/issue31526 buf.seek(0) assert buf.read(2) == b"\x1f\x8b", "Invalid gzip header" buf.seek(4) buf.write(struct.pack(" Generator[tuple[bytes, "TreeEntry"], None, None]: """Recursively walk a dulwich Tree, yielding tuples of (absolute path, TreeEntry) along the way.""" for entry in tree.iteritems(): assert entry.path is not None entry_abspath = posixpath.join(root, entry.path) assert entry.mode is not None if stat.S_ISDIR(entry.mode): assert entry.sha is not None subtree = store[entry.sha] if isinstance(subtree, Tree): yield from _walk_tree(store, subtree, entry_abspath) else: yield (entry_abspath, entry) dulwich-1.0.0/dulwich/attrs.py000066400000000000000000000322371513301442600163230ustar00rootroot00000000000000# attrs.py -- Git attributes for dulwich # Copyright (C) 2019-2020 Collabora Ltd # Copyright (C) 2019-2020 Andrej Shadura # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Parse .gitattributes file.""" __all__ = [ "AttributeValue", "GitAttributes", "Pattern", "match_path", "parse_git_attributes", "parse_gitattributes_file", "read_gitattributes", ] import os import re from collections.abc import Generator, Iterator, Mapping, Sequence from typing import IO AttributeValue = bytes | bool | None def _parse_attr(attr: bytes) -> tuple[bytes, AttributeValue]: """Parse a git attribute into its value. >>> _parse_attr(b'attr') (b'attr', True) >>> _parse_attr(b'-attr') (b'attr', False) >>> _parse_attr(b'!attr') (b'attr', None) >>> _parse_attr(b'attr=text') (b'attr', b'text') """ if attr.startswith(b"!"): return attr[1:], None if attr.startswith(b"-"): return attr[1:], False if b"=" not in attr: return attr, True # Split only on first = to handle values with = in them name, _, value = attr.partition(b"=") return name, value def parse_git_attributes( f: IO[bytes], ) -> Generator[tuple[bytes, Mapping[bytes, AttributeValue]], None, None]: """Parse a Git attributes string. Args: f: File-like object to read bytes from Returns: List of patterns and corresponding patterns in the order or them being encountered >>> from io import BytesIO >>> list(parse_git_attributes(BytesIO(b'''*.tar.* filter=lfs diff=lfs merge=lfs -text ... ... # store signatures in Git ... *.tar.*.asc -filter -diff merge=binary -text ... ... # store .dsc verbatim ... *.dsc -filter !diff merge=binary !text ... 
'''))) #doctest: +NORMALIZE_WHITESPACE [(b'*.tar.*', {'filter': 'lfs', 'diff': 'lfs', 'merge': 'lfs', 'text': False}), (b'*.tar.*.asc', {'filter': False, 'diff': False, 'merge': 'binary', 'text': False}), (b'*.dsc', {'filter': False, 'diff': None, 'merge': 'binary', 'text': None})] """ for line in f: line = line.strip() # Ignore blank lines, they're used for readability. if not line: continue if line.startswith(b"#"): # Comment continue pattern, *attrs = line.split() yield (pattern, {k: v for k, v in (_parse_attr(a) for a in attrs)}) def _translate_pattern(pattern: bytes) -> bytes: """Translate a gitattributes pattern to a regular expression. Similar to gitignore patterns, but simpler as gitattributes doesn't support all the same features (e.g., no directory-only patterns with trailing /). """ res = b"" i = 0 n = len(pattern) # If pattern doesn't contain /, it can match at any level if b"/" not in pattern: res = b"(?:.*/)??" elif pattern.startswith(b"/"): # Leading / means root of repository pattern = pattern[1:] n = len(pattern) while i < n: c = pattern[i : i + 1] i += 1 if c == b"*": if i < n and pattern[i : i + 1] == b"*": # Double asterisk i += 1 if i < n and pattern[i : i + 1] == b"/": # **/ - match zero or more directories res += b"(?:.*/)??" i += 1 elif i == n: # ** at end - match everything res += b".*" else: # ** in middle res += b".*" else: # Single * - match any character except / res += b"[^/]*" elif c == b"?": res += b"[^/]" elif c == b"[": # Character class j = i if j < n and pattern[j : j + 1] == b"!": j += 1 if j < n and pattern[j : j + 1] == b"]": j += 1 while j < n and pattern[j : j + 1] != b"]": j += 1 if j >= n: res += b"\\[" else: stuff = pattern[i:j].replace(b"\\", b"\\\\") i = j + 1 if stuff.startswith(b"!"): stuff = b"^" + stuff[1:] elif stuff.startswith(b"^"): stuff = b"\\" + stuff res += b"[" + stuff + b"]" else: res += re.escape(c) return res class Pattern: """A single gitattributes pattern.""" def __init__(self, pattern: bytes): """Initialize GitAttributesPattern. Args: pattern: Attribute pattern as bytes """ self.pattern = pattern self._regex: re.Pattern[bytes] | None = None self._compile() def _compile(self) -> None: """Compile the pattern to a regular expression.""" regex_pattern = _translate_pattern(self.pattern) # Add anchors regex_pattern = b"^" + regex_pattern + b"$" self._regex = re.compile(regex_pattern) def match(self, path: bytes) -> bool: """Check if path matches this pattern. Args: path: Path to check (relative to repository root, using / separators) Returns: True if path matches this pattern """ # Normalize path if path.startswith(b"/"): path = path[1:] # Try to match assert self._regex is not None # Always set by _compile() return bool(self._regex.match(path)) def match_path( patterns: Sequence[tuple[Pattern, Mapping[bytes, AttributeValue]]], path: bytes ) -> dict[bytes, AttributeValue]: """Get attributes for a path by matching against patterns. 
Args: patterns: List of (Pattern, attributes) tuples path: Path to match (relative to repository root) Returns: Dictionary of attributes that apply to this path """ attributes: dict[bytes, AttributeValue] = {} # Later patterns override earlier ones for pattern, attrs in patterns: if pattern.match(path): # Update attributes for name, value in attrs.items(): if value is None: # Unspecified - remove the attribute attributes.pop(name, None) else: attributes[name] = value return attributes def parse_gitattributes_file( filename: str | bytes, ) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]: """Parse a gitattributes file and return compiled patterns. Args: filename: Path to the .gitattributes file Returns: List of (Pattern, attributes) tuples """ patterns = [] if isinstance(filename, str): filename = filename.encode("utf-8") with open(filename, "rb") as f: for pattern_bytes, attrs in parse_git_attributes(f): pattern = Pattern(pattern_bytes) patterns.append((pattern, attrs)) return patterns def read_gitattributes( path: str | bytes, ) -> list[tuple[Pattern, Mapping[bytes, AttributeValue]]]: """Read .gitattributes from a directory. Args: path: Directory path to check for .gitattributes Returns: List of (Pattern, attributes) tuples """ if isinstance(path, bytes): path = path.decode("utf-8") gitattributes_path = os.path.join(path, ".gitattributes") if os.path.exists(gitattributes_path): return parse_gitattributes_file(gitattributes_path) return [] class GitAttributes: """A collection of gitattributes patterns that can match paths.""" def __init__( self, patterns: list[tuple[Pattern, Mapping[bytes, AttributeValue]]] | None = None, ): """Initialize GitAttributes. Args: patterns: Optional list of (Pattern, attributes) tuples """ self._patterns = patterns or [] def match_path(self, path: bytes) -> dict[bytes, AttributeValue]: """Get attributes for a path by matching against patterns. Args: path: Path to match (relative to repository root) Returns: Dictionary of attributes that apply to this path """ return match_path(self._patterns, path) def add_patterns( self, patterns: Sequence[tuple[Pattern, Mapping[bytes, AttributeValue]]] ) -> None: """Add patterns to the collection. Args: patterns: List of (Pattern, attributes) tuples to add """ self._patterns.extend(patterns) def __len__(self) -> int: """Return the number of patterns.""" return len(self._patterns) def __iter__(self) -> Iterator[tuple["Pattern", Mapping[bytes, AttributeValue]]]: """Iterate over patterns.""" return iter(self._patterns) @classmethod def from_file(cls, filename: str | bytes) -> "GitAttributes": """Create GitAttributes from a gitattributes file. Args: filename: Path to the .gitattributes file Returns: New GitAttributes instance """ patterns = parse_gitattributes_file(filename) return cls(patterns) @classmethod def from_path(cls, path: str | bytes) -> "GitAttributes": """Create GitAttributes from .gitattributes in a directory. Args: path: Directory path to check for .gitattributes Returns: New GitAttributes instance """ patterns = read_gitattributes(path) return cls(patterns) def set_attribute(self, pattern: bytes, name: bytes, value: AttributeValue) -> None: """Set an attribute for a pattern. 
Args: pattern: The file pattern name: Attribute name value: Attribute value (bytes, True, False, or None) """ # Find existing pattern pattern_obj = None attrs_dict: dict[bytes, AttributeValue] | None = None pattern_index = -1 for i, (p, attrs) in enumerate(self._patterns): if p.pattern == pattern: pattern_obj = p # Convert to mutable dict attrs_dict = dict(attrs) pattern_index = i break if pattern_obj is None: # Create new pattern pattern_obj = Pattern(pattern) attrs_dict = {name: value} self._patterns.append((pattern_obj, attrs_dict)) else: # Update the existing pattern in the list assert pattern_index >= 0 assert attrs_dict is not None self._patterns[pattern_index] = (pattern_obj, attrs_dict) # Update the attribute if attrs_dict is None: raise AssertionError("attrs_dict should not be None at this point") attrs_dict[name] = value def remove_pattern(self, pattern: bytes) -> None: """Remove all attributes for a pattern. Args: pattern: The file pattern to remove """ self._patterns = [ (p, attrs) for p, attrs in self._patterns if p.pattern != pattern ] def to_bytes(self) -> bytes: """Convert GitAttributes to bytes format suitable for writing to file. Returns: Bytes representation of the gitattributes file """ lines = [] for pattern_obj, attrs in self._patterns: pattern = pattern_obj.pattern attr_strs = [] for name, value in sorted(attrs.items()): if value is True: attr_strs.append(name) elif value is False: attr_strs.append(b"-" + name) elif value is None: attr_strs.append(b"!" + name) else: # value is bytes attr_strs.append(name + b"=" + value) if attr_strs: line = pattern + b" " + b" ".join(attr_strs) lines.append(line) return b"\n".join(lines) + b"\n" if lines else b"" def write_to_file(self, filename: str | bytes) -> None: """Write GitAttributes to a file. Args: filename: Path to write the .gitattributes file """ if isinstance(filename, str): filename = filename.encode("utf-8") content = self.to_bytes() with open(filename, "wb") as f: f.write(content) dulwich-1.0.0/dulwich/bisect.py000066400000000000000000000362771513301442600164470ustar00rootroot00000000000000# bisect.py -- Git bisect algorithm implementation # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git bisect implementation.""" __all__ = ["BisectState"] import os from collections.abc import Sequence, Set from dulwich.object_store import peel_sha from dulwich.objects import Commit, ObjectID from dulwich.refs import HEADREF, Ref from dulwich.repo import Repo class BisectState: """Manages the state of a bisect session.""" def __init__(self, repo: Repo) -> None: """Initialize BisectState. 
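Example (illustrative sketch; ``repo`` is assumed to be an existing dulwich ``Repo`` and ``bad_sha``, ``good_sha``, ``tested_sha`` are placeholder hex commit SHAs, not provided by this module):

    state = BisectState(repo)
    state.start(bad=bad_sha, good=[good_sha])
    # Test the suggested commit, then report the outcome; each call returns
    # the next commit to test, or None once the first bad commit is found.
    next_sha = state.mark_good(tested_sha)   # or state.mark_bad(...) / state.skip([...])
    state.reset()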
Args: repo: Repository to perform bisect on """ self.repo = repo self._bisect_dir = os.path.join(repo.controldir(), "BISECT_START") @property def is_active(self) -> bool: """Check if a bisect session is active.""" return os.path.exists(self._bisect_dir) def start( self, bad: ObjectID | None = None, good: Sequence[ObjectID] | None = None, paths: Sequence[bytes] | None = None, no_checkout: bool = False, term_bad: str = "bad", term_good: str = "good", ) -> None: """Start a new bisect session. Args: bad: The bad commit SHA (defaults to HEAD) good: List of good commit SHAs paths: Optional paths to limit bisect to no_checkout: If True, don't checkout commits during bisect term_bad: Term to use for bad commits (default: "bad") term_good: Term to use for good commits (default: "good") """ if self.is_active: raise ValueError("Bisect session already in progress") # Create bisect state directory bisect_refs_dir = os.path.join(self.repo.controldir(), "refs", "bisect") os.makedirs(bisect_refs_dir, exist_ok=True) # Store current branch/commit try: ref_chain, sha = self.repo.refs.follow(HEADREF) if sha is None: # No HEAD exists raise ValueError("Cannot start bisect: repository has no HEAD") # Use the first non-HEAD ref in the chain, or the SHA itself current_branch: Ref | ObjectID if len(ref_chain) > 1: current_branch = ref_chain[1] # The actual branch ref else: current_branch = sha # Detached HEAD except KeyError: # Detached HEAD try: current_branch = self.repo.head() except KeyError: # No HEAD exists - can't start bisect raise ValueError("Cannot start bisect: repository has no HEAD") # Write BISECT_START with open(self._bisect_dir, "wb") as f: f.write(current_branch) # Write BISECT_TERMS terms_file = os.path.join(self.repo.controldir(), "BISECT_TERMS") with open(terms_file, "w") as f: f.write(f"{term_bad}\n{term_good}\n") # Write BISECT_NAMES (paths) names_file = os.path.join(self.repo.controldir(), "BISECT_NAMES") with open(names_file, "w") as f: if paths: f.write( "\n".join(path.decode("utf-8", "replace") for path in paths) + "\n" ) else: f.write("\n") # Initialize BISECT_LOG log_file = os.path.join(self.repo.controldir(), "BISECT_LOG") with open(log_file, "w") as f: f.write("git bisect start\n") f.write("# status: waiting for both good and bad commits\n") # Mark bad commit if provided if bad is not None: self.mark_bad(bad) # Mark good commits if provided if good: for g in good: self.mark_good(g) def mark_bad(self, rev: ObjectID | None = None) -> ObjectID | None: """Mark a commit as bad. Args: rev: Commit SHA to mark as bad (defaults to HEAD) Returns: The SHA of the next commit to test, or None if bisect is complete """ if not self.is_active: raise ValueError("No bisect session in progress") if rev is None: rev = self.repo.head() else: rev = peel_sha(self.repo.object_store, rev)[1].id # Write bad ref bad_ref_path = os.path.join(self.repo.controldir(), "refs", "bisect", "bad") with open(bad_ref_path, "wb") as f: f.write(rev + b"\n") # Update log self._append_to_log( f"# bad: [{rev.decode('ascii')}] {self._get_commit_subject(rev)}" ) self._append_to_log(f"git bisect bad {rev.decode('ascii')}") return self._find_next_commit() def mark_good(self, rev: ObjectID | None = None) -> ObjectID | None: """Mark a commit as good. 
Args: rev: Commit SHA to mark as good (defaults to HEAD) Returns: The SHA of the next commit to test, or None if bisect is complete """ if not self.is_active: raise ValueError("No bisect session in progress") if rev is None: rev = self.repo.head() else: rev = peel_sha(self.repo.object_store, rev)[1].id # Write good ref good_ref_path = os.path.join( self.repo.controldir(), "refs", "bisect", f"good-{rev.decode('ascii')}" ) with open(good_ref_path, "wb") as f: f.write(rev + b"\n") # Update log self._append_to_log( f"# good: [{rev.decode('ascii')}] {self._get_commit_subject(rev)}" ) self._append_to_log(f"git bisect good {rev.decode('ascii')}") return self._find_next_commit() def skip(self, revs: Sequence[ObjectID] | None = None) -> ObjectID | None: """Skip one or more commits. Args: revs: List of commits to skip (defaults to [HEAD]) Returns: The SHA of the next commit to test, or None if bisect is complete """ if not self.is_active: raise ValueError("No bisect session in progress") if revs is None: revs = [self.repo.head()] for rev in revs: rev = peel_sha(self.repo.object_store, rev)[1].id skip_ref_path = os.path.join( self.repo.controldir(), "refs", "bisect", f"skip-{rev.decode('ascii')}" ) with open(skip_ref_path, "wb") as f: f.write(rev + b"\n") self._append_to_log(f"git bisect skip {rev.decode('ascii')}") return self._find_next_commit() def reset(self, commit: ObjectID | None = None) -> None: """Reset bisect state and return to original branch/commit. Args: commit: Optional commit to reset to (defaults to original branch/commit) """ if not self.is_active: raise ValueError("No bisect session in progress") # Read original branch/commit with open(self._bisect_dir, "rb") as f: original = f.read().strip() # Clean up bisect files for filename in [ "BISECT_START", "BISECT_TERMS", "BISECT_NAMES", "BISECT_LOG", "BISECT_EXPECTED_REV", "BISECT_ANCESTORS_OK", ]: filepath = os.path.join(self.repo.controldir(), filename) if os.path.exists(filepath): os.remove(filepath) # Clean up refs/bisect directory bisect_refs_dir = os.path.join(self.repo.controldir(), "refs", "bisect") if os.path.exists(bisect_refs_dir): for filename in os.listdir(bisect_refs_dir): os.remove(os.path.join(bisect_refs_dir, filename)) os.rmdir(bisect_refs_dir) # Reset to target commit/branch if commit is None: if original.startswith(b"refs/"): # It's a branch reference - need to create a symbolic ref self.repo.refs.set_symbolic_ref(HEADREF, Ref(original)) else: # It's a commit SHA self.repo.refs[HEADREF] = ObjectID(original) else: commit = peel_sha(self.repo.object_store, commit)[1].id self.repo.refs[HEADREF] = commit def get_log(self) -> str: """Get the bisect log.""" if not self.is_active: raise ValueError("No bisect session in progress") log_file = os.path.join(self.repo.controldir(), "BISECT_LOG") with open(log_file) as f: return f.read() def replay(self, log_content: str) -> None: """Replay a bisect log. 
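Example (illustrative sketch; ``source_state`` is assumed to be an active BisectState and ``other_repo`` another dulwich Repo):

    log = source_state.get_log()
    BisectState(other_repo).replay(log)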
Args: log_content: The bisect log content to replay """ # Parse and execute commands from log for line in log_content.splitlines(): line = line.strip() if line.startswith("#") or not line: continue parts = line.split() if len(parts) < 3 or parts[0] != "git" or parts[1] != "bisect": continue cmd = parts[2] args = parts[3:] if len(parts) > 3 else [] if cmd == "start": self.start() elif cmd == "bad": rev = ObjectID(args[0].encode("ascii")) if args else None self.mark_bad(rev) elif cmd == "good": rev = ObjectID(args[0].encode("ascii")) if args else None self.mark_good(rev) elif cmd == "skip": revs = [ObjectID(arg.encode("ascii")) for arg in args] if args else None self.skip(revs) def _find_next_commit(self) -> ObjectID | None: """Find the next commit to test using binary search. Returns: The SHA of the next commit to test, or None if bisect is complete """ # Get bad commit bad_ref_path = os.path.join(self.repo.controldir(), "refs", "bisect", "bad") if not os.path.exists(bad_ref_path): self._append_to_log("# status: waiting for both good and bad commits") return None with open(bad_ref_path, "rb") as f: bad_sha = ObjectID(f.read().strip()) # Get all good commits good_shas: list[ObjectID] = [] bisect_refs_dir = os.path.join(self.repo.controldir(), "refs", "bisect") for filename in os.listdir(bisect_refs_dir): if filename.startswith("good-"): with open(os.path.join(bisect_refs_dir, filename), "rb") as f: good_shas.append(ObjectID(f.read().strip())) if not good_shas: self._append_to_log( "# status: waiting for good commit(s), bad commit known" ) return None # Get skip commits skip_shas: set[ObjectID] = set() for filename in os.listdir(bisect_refs_dir): if filename.startswith("skip-"): with open(os.path.join(bisect_refs_dir, filename), "rb") as f: skip_shas.add(ObjectID(f.read().strip())) # Find commits between good and bad candidates = self._find_bisect_candidates(bad_sha, good_shas, skip_shas) if not candidates: # Bisect complete - the first bad commit is found self._append_to_log( f"# first bad commit: [{bad_sha.decode('ascii')}] " f"{self._get_commit_subject(bad_sha)}" ) return None # Find midpoint mid_idx = len(candidates) // 2 next_commit = candidates[mid_idx] # Write BISECT_EXPECTED_REV expected_file = os.path.join(self.repo.controldir(), "BISECT_EXPECTED_REV") with open(expected_file, "wb") as f: f.write(next_commit + b"\n") # Update status in log steps_remaining = self._estimate_steps(len(candidates)) self._append_to_log( f"Bisecting: {len(candidates) - 1} revisions left to test after this " f"(roughly {steps_remaining} steps)" ) self._append_to_log( f"[{next_commit.decode('ascii')}] {self._get_commit_subject(next_commit)}" ) return next_commit def _find_bisect_candidates( self, bad_sha: ObjectID, good_shas: Sequence[ObjectID], skip_shas: Set[ObjectID] ) -> list[ObjectID]: """Find all commits between good and bad commits. 
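Example (worked illustration): for a linear history ``G -> C1 -> C2 -> B`` with ``G`` marked good and ``B`` marked bad, the walk visits ``B``, ``C2`` and ``C1`` (stopping at ``G``), drops ``B`` itself and returns ``[C2, C1]``; ``_find_next_commit`` then picks the entry at index ``len(candidates) // 2``, i.e. ``C1``, as the next commit to test.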
Args: bad_sha: The bad commit SHA good_shas: List of good commit SHAs skip_shas: Set of commits to skip Returns: List of candidate commit SHAs in topological order """ # Use git's graph walking to find commits # This is a simplified version - a full implementation would need # to handle merge commits properly candidates: list[ObjectID] = [] visited: set[ObjectID] = set(good_shas) queue: list[ObjectID] = [bad_sha] while queue: sha = queue.pop(0) if sha in visited or sha in skip_shas: continue visited.add(sha) commit = self.repo.object_store[sha] # Don't include good commits if sha not in good_shas: candidates.append(sha) # Add parents to queue if isinstance(commit, Commit): for parent in commit.parents: if parent not in visited: queue.append(parent) # Remove the bad commit itself if bad_sha in candidates: candidates.remove(bad_sha) return candidates def _get_commit_subject(self, sha: ObjectID) -> str: """Get the subject line of a commit message.""" obj = self.repo.object_store[sha] if isinstance(obj, Commit): message = obj.message.decode("utf-8", errors="replace") lines = message.split("\n") return lines[0] if lines else "" return "" def _append_to_log(self, line: str) -> None: """Append a line to the bisect log.""" log_file = os.path.join(self.repo.controldir(), "BISECT_LOG") with open(log_file, "a") as f: f.write(line + "\n") def _estimate_steps(self, num_candidates: int) -> int: """Estimate the number of steps remaining in bisect.""" if num_candidates <= 1: return 0 steps = 0 while num_candidates > 1: num_candidates //= 2 steps += 1 return steps dulwich-1.0.0/dulwich/bitmap.py000066400000000000000000001144471513301442600164460ustar00rootroot00000000000000# bitmap.py -- Packfile bitmap support for git # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Support for Git packfile bitmaps. Bitmaps store reachability information for packfiles, enabling faster object counting and enumeration operations without full graph traversal. The bitmap format uses EWAH (Enhanced Word-Aligned Hybrid) compression for efficient storage and fast bitwise operations. 
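Example (illustrative sketch; the pack path, the ``pack`` object and ``commit_sha`` are assumptions, not provided by this module):

    bm = read_bitmap("objects/pack/pack-1234abcd.bitmap", pack_index=pack.index)
    reachable = bm.get_bitmap(commit_sha)
    if reachable is not None:
        shas = bitmap_to_object_shas(reachable, pack.index)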
""" __all__ = [ "BITMAP_OPT_FULL_DAG", "BITMAP_OPT_HASH_CACHE", "BITMAP_OPT_LOOKUP_TABLE", "BITMAP_OPT_PSEUDO_MERGES", "BITMAP_SIGNATURE", "BITMAP_VERSION", "DEFAULT_COMMIT_INTERVAL", "MAX_LITERAL_WORDS", "MAX_XOR_OFFSET", "BitmapEntry", "EWAHBitmap", "PackBitmap", "apply_xor_compression", "bitmap_to_object_shas", "build_name_hash_cache", "build_reachability_bitmap", "build_type_bitmaps", "find_commit_bitmaps", "generate_bitmap", "read_bitmap", "read_bitmap_file", "select_bitmap_commits", "write_bitmap", "write_bitmap_file", ] import os import struct from collections import deque from collections.abc import Callable, Iterable, Iterator from io import BytesIO from typing import IO, TYPE_CHECKING from .file import GitFile from .objects import ( Blob, Commit, ObjectID, RawObjectID, Tag, Tree, hex_to_sha, sha_to_hex, ) if TYPE_CHECKING: from .object_store import BaseObjectStore from .pack import Pack, PackIndex from .refs import Ref # Bitmap file signature BITMAP_SIGNATURE = b"BITM" # Bitmap format version BITMAP_VERSION = 1 # Bitmap flags BITMAP_OPT_FULL_DAG = 0x1 # Full closure BITMAP_OPT_HASH_CACHE = 0x4 # Name-hash cache BITMAP_OPT_LOOKUP_TABLE = 0x10 # Lookup table for random access BITMAP_OPT_PSEUDO_MERGES = 0x20 # Pseudo-merge bitmaps # EWAH compression constants MAX_LITERAL_WORDS = 0x7FFFFFFF # Maximum literal words in EWAH format (31 bits) MAX_XOR_OFFSET = 160 # Maximum distance to search for XOR compression base DEFAULT_COMMIT_INTERVAL = 100 # Default interval for commit selection def _encode_ewah_words(words: list[int]) -> list[int]: """Encode a list of 64-bit words using EWAH run-length compression. Args: words: List of 64-bit words to encode Returns: List of compressed words (RLWs followed by literals) """ compressed_words = [] i = 0 while i < len(words): # Check for runs of all zeros or all ones if words[i] == 0 or words[i] == 0xFFFFFFFFFFFFFFFF: # Count consecutive identical words run_value = words[i] run_length = 0 while i < len(words) and words[i] == run_value: run_length += 1 i += 1 # Collect following literal words literals = [] while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF: literals.append(words[i]) i += 1 if len(literals) >= MAX_LITERAL_WORDS: break # Create RLW with correct bit layout: # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)] running_bit = 1 if run_value == 0xFFFFFFFFFFFFFFFF else 0 rlw = (len(literals) << 33) | (run_length << 1) | running_bit compressed_words.append(rlw) compressed_words.extend(literals) else: # Collect literal words literals = [] while i < len(words) and words[i] != 0 and words[i] != 0xFFFFFFFFFFFFFFFF: literals.append(words[i]) i += 1 if len(literals) >= MAX_LITERAL_WORDS: break # RLW with no run, just literals # [literal_words(31 bits)][running_len(32 bits)][running_bit(1 bit)] rlw = (len(literals) << 33) | (0 << 1) | 0 compressed_words.append(rlw) compressed_words.extend(literals) return compressed_words class EWAHBitmap: """EWAH (Enhanced Word-Aligned Hybrid) compressed bitmap. EWAH uses run-length encoding for efficient bitmap storage. 
Each bitmap consists of: - Uncompressed bit count (4 bytes) - Compressed word count (4 bytes) - Compressed words (8 bytes each) - Current RLW position (4 bytes) Each Run Length Word (RLW) 64-bit layout (LSB to MSB): - Bit 0: running_bit (1 bit) - value of repeated words (0 or 1) - Bits 1-32: running_len (32 bits) - count of repeated words - Bits 33-63: literal_words (31 bits) - count of literal words following this RLW """ def __init__(self, data: bytes | None = None) -> None: """Initialize EWAH bitmap. Args: data: Optional compressed bitmap data to decode """ self.bits: set[int] = set() self.bit_count = 0 if data: self._decode(data) def _decode(self, data: bytes) -> None: """Decode EWAH compressed bitmap data. Args: data: Compressed bitmap data (EWAH format with header + words + RLW position) """ f = BytesIO(data) # Read header bit_count_bytes = f.read(4) word_count_bytes = f.read(4) if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4: return bit_count = struct.unpack(">I", bit_count_bytes)[0] word_count = struct.unpack(">I", word_count_bytes)[0] self.bit_count = bit_count current_bit = 0 # Read all words first words = [] for _ in range(word_count): word_bytes = f.read(8) if len(word_bytes) < 8: break word = struct.unpack(">Q", word_bytes)[0] words.append(word) # Process EWAH chunks: RLW followed by literal words idx = 0 while idx < len(words): # This is an RLW # Bit layout: [literal_words(31)][running_len(32)][running_bit(1)] rlw = words[idx] running_bit = rlw & 1 running_len = (rlw >> 1) & 0xFFFFFFFF literal_words = rlw >> 33 idx += 1 # Process running bits if running_len > 0: if running_bit == 1: # Add all bits in the repeated section for i in range(running_len * 64): self.bits.add(current_bit + i) current_bit += running_len * 64 # Process literal words for _ in range(literal_words): if idx >= len(words): break literal = words[idx] idx += 1 # Extract set bits from literal word for i in range(64): if literal & (1 << i): self.bits.add(current_bit + i) current_bit += 64 # Read RLW position (we don't use it currently, but it's part of the format) f.read(4) def encode(self) -> bytes: """Encode bitmap to EWAH compressed format. Returns: Compressed bitmap data including header, words, and RLW position """ if not self.bits: # Empty bitmap: bit_count=0, word_count=0, rlw_pos=0 return struct.pack(">III", 0, 0, 0) max_bit = max(self.bits) if self.bits else 0 bit_count = max_bit + 1 word_count = (bit_count + 63) // 64 # Create literal words words = [0] * word_count for bit in self.bits: word_idx = bit // 64 bit_idx = bit % 64 words[word_idx] |= 1 << bit_idx # Compress using EWAH run-length encoding compressed_words = _encode_ewah_words(words) # Build EWAH data f = BytesIO() # Header f.write(struct.pack(">I", bit_count)) f.write(struct.pack(">I", len(compressed_words))) # Write compressed words for word in compressed_words: f.write(struct.pack(">Q", word)) # Write RLW position (position of last RLW in the compressed words) # For now, we'll use 0 as we don't track this during encoding # This could be improved in the future if needed f.write(struct.pack(">I", 0)) return f.getvalue() def __contains__(self, bit: int) -> bool: """Check if a bit is set. Args: bit: Bit position to check Returns: True if bit is set, False otherwise """ return bit in self.bits def __len__(self) -> int: """Return the number of set bits. Returns: Count of set bits """ return len(self.bits) def __or__(self, other: "EWAHBitmap") -> "EWAHBitmap": """Bitwise OR operation. 
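Example (illustrative sketch of the set-style operators):

    a = EWAHBitmap()
    a.add(1)
    a.add(5)
    b = EWAHBitmap()
    b.add(5)
    sorted((a | b).bits)   # [1, 5]
    sorted((a & b).bits)   # [5]
    sorted((a - b).bits)   # [1]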
Args: other: Other bitmap to OR with Returns: New bitmap with OR result """ result = EWAHBitmap() result.bits = self.bits | other.bits result.bit_count = max(self.bit_count, other.bit_count) return result def __and__(self, other: "EWAHBitmap") -> "EWAHBitmap": """Bitwise AND operation. Args: other: Other bitmap to AND with Returns: New bitmap with AND result """ result = EWAHBitmap() result.bits = self.bits & other.bits result.bit_count = max(self.bit_count, other.bit_count) return result def __xor__(self, other: "EWAHBitmap") -> "EWAHBitmap": """Bitwise XOR operation. Args: other: Other bitmap to XOR with Returns: New bitmap with XOR result """ result = EWAHBitmap() result.bits = self.bits ^ other.bits result.bit_count = max(self.bit_count, other.bit_count) return result def __sub__(self, other: "EWAHBitmap") -> "EWAHBitmap": """Bitwise subtraction (set difference). Returns bits that are in self but not in other. Equivalent to: self & ~other Args: other: Bitmap to subtract Returns: New bitmap with bits in self but not in other """ result = EWAHBitmap() result.bits = self.bits - other.bits result.bit_count = self.bit_count return result def add(self, bit: int) -> None: """Set a bit. Args: bit: Bit position to set """ self.bits.add(bit) self.bit_count = max(self.bit_count, bit + 1) class BitmapEntry: """A single bitmap entry for a commit.""" def __init__( self, object_pos: int, xor_offset: int, flags: int, bitmap: EWAHBitmap, ) -> None: """Initialize a bitmap entry. Args: object_pos: Position of object in pack index xor_offset: XOR offset for compression flags: Entry flags bitmap: The EWAH bitmap data """ self.object_pos = object_pos self.xor_offset = xor_offset self.flags = flags self.bitmap = bitmap class PackBitmap: """A pack bitmap index. Bitmaps store reachability information for commits in a packfile, allowing fast object enumeration without graph traversal. """ def __init__( self, version: int = BITMAP_VERSION, flags: int = BITMAP_OPT_FULL_DAG, ) -> None: """Initialize a pack bitmap. Args: version: Bitmap format version flags: Bitmap flags """ self.version = version self.flags = flags self.pack_checksum: bytes | None = None # Type bitmaps for commits, trees, blobs, tags self.commit_bitmap = EWAHBitmap() self.tree_bitmap = EWAHBitmap() self.blob_bitmap = EWAHBitmap() self.tag_bitmap = EWAHBitmap() # Bitmap entries indexed by commit SHA self.entries: dict[bytes, BitmapEntry] = {} # List of entries in order (for XOR offset resolution) self.entries_list: list[tuple[bytes, BitmapEntry]] = [] # Optional lookup table for random access self.lookup_table: list[tuple[int, int, int]] | None = None # Optional name-hash cache self.name_hash_cache: list[int] | None = None def get_bitmap(self, commit_sha: bytes) -> EWAHBitmap | None: """Get the bitmap for a commit. 
Args: commit_sha: SHA-1 of the commit Returns: EWAH bitmap or None if not found """ entry = self.entries.get(commit_sha) if entry is None: return None # Decompress using XOR if needed if entry.xor_offset > 0: # Find the entry at the XOR offset # The XOR offset tells us how many entries back to look # We need to find this entry in the ordered list try: current_idx = next( i for i, (sha, _) in enumerate(self.entries_list) if sha == commit_sha ) except StopIteration: # Entry not found in list, return as-is return entry.bitmap # XOR offset is how many positions back to look if current_idx >= entry.xor_offset: base_sha, _base_entry = self.entries_list[ current_idx - entry.xor_offset ] # Get the base bitmap (recursively if it also uses XOR) base_bitmap = self.get_bitmap(base_sha) if base_bitmap is not None: # XOR the current bitmap with the base return entry.bitmap ^ base_bitmap return entry.bitmap def has_commit(self, commit_sha: bytes) -> bool: """Check if a commit has a bitmap. Args: commit_sha: SHA-1 of the commit Returns: True if bitmap exists for this commit """ return commit_sha in self.entries def iter_commits(self) -> Iterator[bytes]: """Iterate over all commits with bitmaps. Returns: Iterator of commit SHAs """ return iter(self.entries.keys()) def read_bitmap( filename: str | os.PathLike[str], pack_index: "PackIndex | None" = None, ) -> PackBitmap: """Read a bitmap index file. Args: filename: Path to the .bitmap file pack_index: Optional PackIndex to resolve object positions to SHAs Returns: Loaded PackBitmap Raises: ValueError: If file format is invalid ChecksumMismatch: If checksum verification fails """ with GitFile(filename, "rb") as f: return read_bitmap_file(f, pack_index=pack_index) def read_bitmap_file(f: IO[bytes], pack_index: "PackIndex | None" = None) -> PackBitmap: """Read bitmap data from a file object. 
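Example (illustrative sketch; ``data`` is assumed to hold the raw bytes of a ``*.bitmap`` file belonging to ``pack``):

    from io import BytesIO
    bm = read_bitmap_file(BytesIO(data), pack_index=pack.index)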
Args: f: File object to read from pack_index: Optional PackIndex to resolve object positions to SHAs Returns: Loaded PackBitmap Raises: ValueError: If file format is invalid """ # Read header signature = f.read(4) if signature != BITMAP_SIGNATURE: raise ValueError( f"Invalid bitmap signature: {signature!r}, expected {BITMAP_SIGNATURE!r}" ) version_bytes = f.read(2) flags_bytes = f.read(2) if len(version_bytes) < 2 or len(flags_bytes) < 2: raise ValueError("Incomplete bitmap header") version = struct.unpack(">H", version_bytes)[0] flags = struct.unpack(">H", flags_bytes)[0] if version != BITMAP_VERSION: raise ValueError(f"Unsupported bitmap version: {version}") # Read entry count entry_count_bytes = f.read(4) if len(entry_count_bytes) < 4: raise ValueError("Missing entry count") entry_count = struct.unpack(">I", entry_count_bytes)[0] # Read pack checksum pack_checksum = f.read(20) if len(pack_checksum) < 20: raise ValueError("Missing pack checksum") bitmap = PackBitmap(version=version, flags=flags) bitmap.pack_checksum = pack_checksum # Read type bitmaps (EWAH bitmaps are self-describing) for i, type_bitmap in enumerate( [ bitmap.commit_bitmap, bitmap.tree_bitmap, bitmap.blob_bitmap, bitmap.tag_bitmap, ] ): # EWAH format: # 4 bytes: bit count # 4 bytes: word count # N x 8 bytes: compressed words # 4 bytes: RLW position # Read header to determine size bit_count_bytes = f.read(4) word_count_bytes = f.read(4) if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4: raise ValueError(f"Missing type bitmap {i} header") word_count = struct.unpack(">I", word_count_bytes)[0] # Read compressed words words_data = f.read(word_count * 8) if len(words_data) < word_count * 8: raise ValueError(f"Incomplete type bitmap {i} data") # Read RLW position rlw_pos_bytes = f.read(4) if len(rlw_pos_bytes) < 4: raise ValueError(f"Missing type bitmap {i} RLW position") # Reconstruct the full EWAH data to pass to _decode ewah_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes type_bitmap._decode(ewah_data) # Read bitmap entries for _ in range(entry_count): # Read object position (4 bytes) obj_pos_bytes = f.read(4) if len(obj_pos_bytes) < 4: raise ValueError("Incomplete bitmap entry") obj_pos = struct.unpack(">I", obj_pos_bytes)[0] # Read XOR offset (1 byte) xor_offset_bytes = f.read(1) if len(xor_offset_bytes) < 1: raise ValueError("Missing XOR offset") xor_offset = xor_offset_bytes[0] # Read flags (1 byte) flags_bytes = f.read(1) if len(flags_bytes) < 1: raise ValueError("Missing entry flags") entry_flags = flags_bytes[0] # Read self-describing EWAH bitmap # EWAH format: bit_count (4) + word_count (4) + words + rlw_pos (4) bit_count_bytes = f.read(4) word_count_bytes = f.read(4) if len(bit_count_bytes) < 4 or len(word_count_bytes) < 4: raise ValueError("Incomplete bitmap entry EWAH header") word_count = struct.unpack(">I", word_count_bytes)[0] # Read compressed words words_data = f.read(word_count * 8) if len(words_data) < word_count * 8: raise ValueError("Incomplete bitmap entry EWAH words") # Read RLW position rlw_pos_bytes = f.read(4) if len(rlw_pos_bytes) < 4: raise ValueError("Missing bitmap entry EWAH RLW position") # Reconstruct full EWAH data bitmap_data = bit_count_bytes + word_count_bytes + words_data + rlw_pos_bytes # Create bitmap entry ewah_bitmap = EWAHBitmap(bitmap_data) if word_count > 0 else EWAHBitmap() entry = BitmapEntry( object_pos=obj_pos, xor_offset=xor_offset, flags=entry_flags, bitmap=ewah_bitmap, ) # Resolve object position to SHA if we have a pack index if pack_index is 
not None: # Get the SHA at the given position in the sorted index sha = None for idx, (entry_sha, _offset, _crc32) in enumerate( pack_index.iterentries() ): if idx == obj_pos: sha = entry_sha break if sha is not None: bitmap.entries[sha] = entry bitmap.entries_list.append((sha, entry)) else: # Without pack index, use position as temporary key temp_key = obj_pos.to_bytes(4, byteorder="big") bitmap.entries[temp_key] = entry bitmap.entries_list.append((temp_key, entry)) # Read optional lookup table if flags & BITMAP_OPT_LOOKUP_TABLE: # Lookup table contains triplets: (commit_pos, offset, xor_row) # Number of entries matches the bitmap entry count lookup_table = [] for _ in range(entry_count): # Read commit position (4 bytes) commit_pos_bytes = f.read(4) if len(commit_pos_bytes) < 4: break commit_pos = struct.unpack(">I", commit_pos_bytes)[0] # Read file offset (8 bytes) offset_bytes = f.read(8) if len(offset_bytes) < 8: break offset = struct.unpack(">Q", offset_bytes)[0] # Read XOR row (4 bytes) xor_row_bytes = f.read(4) if len(xor_row_bytes) < 4: break xor_row = struct.unpack(">I", xor_row_bytes)[0] lookup_table.append((commit_pos, offset, xor_row)) bitmap.lookup_table = lookup_table # Read optional name-hash cache if flags & BITMAP_OPT_HASH_CACHE: # Name-hash cache contains one 32-bit hash per object in the pack # The number of hashes depends on the total number of objects # For now, we'll read what's available name_hash_cache = [] while True: hash_bytes = f.read(4) if len(hash_bytes) < 4: break hash_value = struct.unpack(">I", hash_bytes)[0] name_hash_cache.append(hash_value) if name_hash_cache: bitmap.name_hash_cache = name_hash_cache return bitmap def write_bitmap( filename: str | os.PathLike[str], bitmap: PackBitmap, ) -> None: """Write a bitmap index file. Args: filename: Path to write the .bitmap file bitmap: PackBitmap to write """ with GitFile(filename, "wb") as f: write_bitmap_file(f, bitmap) def write_bitmap_file(f: IO[bytes], bitmap: PackBitmap) -> None: """Write bitmap data to a file object. 
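Example (illustrative sketch; ``repo``, ``pack`` and ``pack_checksum`` are assumed to come from the caller, and the output path is made up):

    bm = generate_bitmap(pack.index, repo.object_store, repo.get_refs(), pack_checksum)
    write_bitmap("objects/pack/pack-1234abcd.bitmap", bm)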
Args: f: File object to write to bitmap: PackBitmap to write """ # Write header f.write(BITMAP_SIGNATURE) f.write(struct.pack(">H", bitmap.version)) f.write(struct.pack(">H", bitmap.flags)) # Write entry count f.write(struct.pack(">I", len(bitmap.entries))) # Write pack checksum if bitmap.pack_checksum: f.write(bitmap.pack_checksum) else: f.write(b"\x00" * 20) # Write type bitmaps (self-describing EWAH format, no size prefix needed) for type_bitmap in [ bitmap.commit_bitmap, bitmap.tree_bitmap, bitmap.blob_bitmap, bitmap.tag_bitmap, ]: data = type_bitmap.encode() f.write(data) # Write bitmap entries for _sha, entry in bitmap.entries.items(): # Write object position (4 bytes) f.write(struct.pack(">I", entry.object_pos)) # Write XOR offset (1 byte) f.write(bytes([entry.xor_offset])) # Write flags (1 byte) f.write(bytes([entry.flags])) # Write compressed bitmap data (self-describing EWAH format, no size prefix) bitmap_data = entry.bitmap.encode() f.write(bitmap_data) # Write optional lookup table if bitmap.flags & BITMAP_OPT_LOOKUP_TABLE and bitmap.lookup_table: for commit_pos, offset, xor_row in bitmap.lookup_table: f.write(struct.pack(">I", commit_pos)) # 4 bytes f.write(struct.pack(">Q", offset)) # 8 bytes f.write(struct.pack(">I", xor_row)) # 4 bytes # Write optional name-hash cache if bitmap.flags & BITMAP_OPT_HASH_CACHE and bitmap.name_hash_cache: for hash_value in bitmap.name_hash_cache: f.write(struct.pack(">I", hash_value)) def _compute_name_hash(name: bytes) -> int: """Compute the name hash for a tree entry. This is the same algorithm Git uses for the name-hash cache. Args: name: The name of the tree entry Returns: 32-bit hash value """ hash_value = 0 for byte in name: hash_value = (hash_value >> 19) | (hash_value << 13) hash_value += byte hash_value &= 0xFFFFFFFF return hash_value def select_bitmap_commits( refs: dict["Ref", ObjectID], object_store: "BaseObjectStore", commit_interval: int = DEFAULT_COMMIT_INTERVAL, ) -> list[ObjectID]: """Select commits for bitmap generation. Uses Git's strategy: - All branch and tag tips - Every Nth commit in history Args: refs: Dictionary of ref names to commit SHAs object_store: Object store to read commits from commit_interval: Include every Nth commit in history Returns: List of commit SHAs to create bitmaps for """ selected = set() seen = set() # Start with all refs ref_commits = set() for ref_name, sha in refs.items(): try: obj = object_store[sha] except KeyError: continue else: # Dereference tags to get to commits while isinstance(obj, Tag): obj = object_store[obj.object[1]] if isinstance(obj, Commit): ref_commits.add(obj.id) # Add all ref tips selected.update(ref_commits) # Walk the commit graph and select every Nth commit queue = deque(ref_commits) commit_count = 0 while queue: commit_sha = queue.popleft() if commit_sha in seen: continue seen.add(commit_sha) try: obj = object_store[commit_sha] if not isinstance(obj, Commit): continue commit_count += 1 if commit_count % commit_interval == 0: selected.add(commit_sha) # Add parents to queue for parent in obj.parents: if parent not in seen: queue.append(parent) except KeyError: continue return sorted(selected) def build_reachability_bitmap( commit_sha: ObjectID, sha_to_pos: dict[RawObjectID, int], object_store: "BaseObjectStore", ) -> EWAHBitmap: """Build a reachability bitmap for a commit. The bitmap has a bit set for each object that is reachable from the commit. The bit position corresponds to the object's position in the pack index. 
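Example (illustrative sketch; ``pack``, ``object_store`` and ``commit_sha`` are assumed):

    sha_to_pos = {sha: i for i, (sha, _, _) in enumerate(pack.index.iterentries())}
    bits = build_reachability_bitmap(commit_sha, sha_to_pos, object_store)
    len(bits)   # number of reachable objects that live in this pack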
Args: commit_sha: The commit to build a bitmap for sha_to_pos: Pre-built mapping from SHA to position in pack object_store: Object store to traverse objects Returns: EWAH bitmap with bits set for reachable objects """ bitmap = EWAHBitmap() # Traverse all objects reachable from the commit seen = set() queue = deque([commit_sha]) while queue: sha = queue.popleft() if sha in seen: continue seen.add(sha) # Add this object to the bitmap if it's in the pack # Convert hex SHA to binary for pack index lookup raw_sha = hex_to_sha(sha) if raw_sha in sha_to_pos: bitmap.add(sha_to_pos[raw_sha]) # Get the object and traverse its references try: obj = object_store[sha] if isinstance(obj, Commit): # Add parents and tree queue.append(obj.tree) queue.extend(obj.parents) elif hasattr(obj, "items"): # Tree object - add all entries for item in obj.items(): queue.append(item.sha) except KeyError: # Object not in store, skip it continue return bitmap def apply_xor_compression( bitmaps: list[tuple[ObjectID, EWAHBitmap]], max_xor_offset: int = MAX_XOR_OFFSET, ) -> list[tuple[ObjectID, EWAHBitmap, int]]: """Apply XOR compression to bitmaps. XOR compression stores some bitmaps as XOR differences from previous bitmaps, reducing storage size when bitmaps are similar. Args: bitmaps: List of (commit_sha, bitmap) tuples max_xor_offset: Maximum offset to search for XOR base Returns: List of (commit_sha, bitmap, xor_offset) tuples """ compressed = [] for i, (sha, bitmap) in enumerate(bitmaps): best_xor_offset = 0 best_size = len(bitmap.encode()) best_xor_bitmap = bitmap # Try XORing with previous bitmaps within max_xor_offset for offset in range(1, min(i + 1, max_xor_offset + 1)): _prev_sha, prev_bitmap = bitmaps[i - offset] xor_bitmap = bitmap ^ prev_bitmap xor_size = len(xor_bitmap.encode()) # Use XOR if it reduces size if xor_size < best_size: best_size = xor_size best_xor_offset = offset best_xor_bitmap = xor_bitmap compressed.append((sha, best_xor_bitmap, best_xor_offset)) return compressed def build_type_bitmaps( sha_to_pos: dict["RawObjectID", int], object_store: "BaseObjectStore", ) -> tuple[EWAHBitmap, EWAHBitmap, EWAHBitmap, EWAHBitmap]: """Build type bitmaps for all objects in a pack. Type bitmaps classify objects by type: commit, tree, blob, or tag. Args: sha_to_pos: Pre-built mapping from SHA to position in pack object_store: Object store to read object types Returns: Tuple of (commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap) """ commit_bitmap = EWAHBitmap() tree_bitmap = EWAHBitmap() blob_bitmap = EWAHBitmap() tag_bitmap = EWAHBitmap() for sha, pos in sha_to_pos.items(): # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes) hex_sha = sha_to_hex(sha) if len(sha) == 20 else ObjectID(sha) try: obj = object_store[hex_sha] except KeyError: # Object not in store, skip it continue obj_type = obj.type_num if obj_type == Commit.type_num: commit_bitmap.add(pos) elif obj_type == Tree.type_num: tree_bitmap.add(pos) elif obj_type == Blob.type_num: blob_bitmap.add(pos) elif obj_type == Tag.type_num: tag_bitmap.add(pos) return commit_bitmap, tree_bitmap, blob_bitmap, tag_bitmap def build_name_hash_cache( sha_to_pos: dict["RawObjectID", int], object_store: "BaseObjectStore", ) -> list[int]: """Build name-hash cache for all objects in a pack. The name-hash cache stores a hash of the name for each object, which can speed up path-based operations. 
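Example (worked illustration of the underlying ``_compute_name_hash``): the hash starts at zero and, for each byte, is first mixed with ``(h >> 19) | (h << 13)`` and then incremented by the byte value, masked to 32 bits. For a single byte the mixing step is a no-op, so ``_compute_name_hash(b"a")`` is ``0x61`` (97).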
Args: sha_to_pos: Pre-built mapping from SHA to position in pack object_store: Object store to read objects Returns: List of 32-bit hash values, one per object in the pack """ # Pre-allocate list with correct size num_objects = len(sha_to_pos) name_hashes = [0] * num_objects for sha, pos in sha_to_pos.items(): # Pack index returns binary SHA (20 bytes), but object_store expects hex SHA (40 bytes) hex_sha = sha_to_hex(sha) if len(sha) == 20 else ObjectID(sha) try: obj = object_store[hex_sha] except KeyError: # Object not in store, use zero hash continue # For tree entries, use the tree entry name # For commits, use the tree SHA # For other objects, use the object SHA if isinstance(obj, Tree): # Tree object - use the SHA as the name name_hash = _compute_name_hash(sha) elif isinstance(obj, Commit): # Commit - use the tree SHA as the name name_hash = _compute_name_hash(obj.tree) else: # Other objects - use the SHA as the name name_hash = _compute_name_hash(sha) name_hashes[pos] = name_hash return name_hashes def generate_bitmap( pack_index: "PackIndex", object_store: "BaseObjectStore", refs: dict["Ref", ObjectID], pack_checksum: bytes, include_hash_cache: bool = True, include_lookup_table: bool = True, commit_interval: int | None = None, progress: Callable[[str], None] | None = None, ) -> PackBitmap: """Generate a complete bitmap for a pack. Args: pack_index: Pack index for the pack object_store: Object store to read objects from refs: Dictionary of ref names to commit SHAs pack_checksum: SHA-1 checksum of the pack file include_hash_cache: Whether to include name-hash cache include_lookup_table: Whether to include lookup table commit_interval: Include every Nth commit in history (None for default) progress: Optional progress reporting callback Returns: Complete PackBitmap ready to write to disk """ if commit_interval is None: commit_interval = DEFAULT_COMMIT_INTERVAL if progress: progress("Building pack index mapping") # Build mapping from SHA to position in pack index ONCE # This is used by all subsequent operations and avoids repeated enumeration sha_to_pos: dict[RawObjectID, int] = {} for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()): sha_to_pos[sha] = pos if progress: progress("Selecting commits for bitmap") # Select commits to create bitmaps for selected_commits = select_bitmap_commits(refs, object_store, commit_interval) if progress: progress(f"Building bitmaps for {len(selected_commits)} commits") # Build reachability bitmaps for selected commits commit_bitmaps = [] for i, commit_sha in enumerate(selected_commits): if progress and i % 10 == 0: progress(f"Building bitmap {i + 1}/{len(selected_commits)}") bitmap = build_reachability_bitmap(commit_sha, sha_to_pos, object_store) commit_bitmaps.append((commit_sha, bitmap)) if progress: progress("Applying XOR compression") # Apply XOR compression compressed_bitmaps = apply_xor_compression(commit_bitmaps) if progress: progress("Building type bitmaps") # Build type bitmaps (using pre-built sha_to_pos mapping) commit_type_bitmap, tree_type_bitmap, blob_type_bitmap, tag_type_bitmap = ( build_type_bitmaps(sha_to_pos, object_store) ) # Create PackBitmap flags = BITMAP_OPT_FULL_DAG if include_hash_cache: flags |= BITMAP_OPT_HASH_CACHE if include_lookup_table: flags |= BITMAP_OPT_LOOKUP_TABLE pack_bitmap = PackBitmap(version=1, flags=flags) pack_bitmap.pack_checksum = pack_checksum pack_bitmap.commit_bitmap = commit_type_bitmap pack_bitmap.tree_bitmap = tree_type_bitmap pack_bitmap.blob_bitmap = blob_type_bitmap 
pack_bitmap.tag_bitmap = tag_type_bitmap # Add bitmap entries for commit_sha, xor_bitmap, xor_offset in compressed_bitmaps: raw_commit_sha = hex_to_sha(commit_sha) if raw_commit_sha not in sha_to_pos: continue entry = BitmapEntry( object_pos=sha_to_pos[raw_commit_sha], xor_offset=xor_offset, flags=0, bitmap=xor_bitmap, ) pack_bitmap.entries[commit_sha] = entry pack_bitmap.entries_list.append((commit_sha, entry)) # Build optional name-hash cache (using pre-built sha_to_pos mapping) if include_hash_cache: if progress: progress("Building name-hash cache") pack_bitmap.name_hash_cache = build_name_hash_cache(sha_to_pos, object_store) # Build optional lookup table if include_lookup_table: if progress: progress("Building lookup table") # The lookup table is built automatically from the entries # For now, we'll leave it as None and let the write function handle it # TODO: Implement lookup table generation if needed pack_bitmap.lookup_table = None if progress: progress("Bitmap generation complete") return pack_bitmap def find_commit_bitmaps( commit_shas: set["ObjectID"], packs: Iterable["Pack"] ) -> dict["ObjectID", tuple["Pack", "PackBitmap", dict[RawObjectID, int]]]: """Find which packs have bitmaps for the given commits. Args: commit_shas: Set of commit SHAs to look for packs: Iterable of Pack objects to search Returns: Dict mapping commit SHA to (pack, pack_bitmap, position) tuple """ result = {} remaining = set(commit_shas) for pack in packs: if not remaining: break pack_bitmap = pack.bitmap if not pack_bitmap: # No bitmap for this pack continue # Build SHA to position mapping for this pack sha_to_pos: dict[RawObjectID, int] = {} for pos, (sha, _offset, _crc32) in enumerate(pack.index.iterentries()): sha_to_pos[sha] = pos # Check which commits have bitmaps for commit_sha in list(remaining): if pack_bitmap.has_commit(commit_sha): raw_commit_sha = hex_to_sha(commit_sha) if raw_commit_sha in sha_to_pos: result[commit_sha] = (pack, pack_bitmap, sha_to_pos) remaining.remove(commit_sha) return result def bitmap_to_object_shas( bitmap: EWAHBitmap, pack_index: "PackIndex", type_filter: EWAHBitmap | None = None, ) -> set[ObjectID]: """Convert a bitmap to a set of object SHAs. Args: bitmap: The EWAH bitmap with set bits for objects pack_index: Pack index to map positions to SHAs type_filter: Optional type bitmap to filter results (e.g., commits only) Returns: Set of object SHAs (hex format) """ result: set[ObjectID] = set() for pos, (sha, _offset, _crc32) in enumerate(pack_index.iterentries()): # Check if this position is in the bitmap if pos in bitmap: # Apply type filter if provided if type_filter is None or pos in type_filter: result.add(sha_to_hex(sha)) return result dulwich-1.0.0/dulwich/bundle.py000066400000000000000000000301751513301442600164360ustar00rootroot00000000000000# bundle.py -- Bundle format support # Copyright (C) 2020 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Bundle format support.""" __all__ = [ "Bundle", "PackDataLike", "create_bundle_from_repo", "read_bundle", "write_bundle", ] import types from collections.abc import Callable, Iterator, Sequence from typing import ( TYPE_CHECKING, BinaryIO, Protocol, cast, runtime_checkable, ) if TYPE_CHECKING: from .object_format import ObjectFormat from .objects import ObjectID from .pack import PackData, UnpackedObject, write_pack_data from .refs import Ref @runtime_checkable class PackDataLike(Protocol): """Protocol for objects that behave like PackData.""" object_format: "ObjectFormat" def __len__(self) -> int: """Return the number of objects in the pack.""" ... def iter_unpacked(self) -> Iterator[UnpackedObject]: """Iterate over unpacked objects in the pack.""" ... def close(self) -> None: """Close any open resources.""" ... if TYPE_CHECKING: from .object_store import BaseObjectStore from .repo import BaseRepo class Bundle: """Git bundle object representation.""" version: int | None capabilities: dict[str, str | None] prerequisites: list[tuple[ObjectID, bytes]] references: dict[Ref, ObjectID] pack_data: PackDataLike | None def __repr__(self) -> str: """Return string representation of Bundle.""" return ( f"<{type(self).__name__}(version={self.version}, " f"capabilities={self.capabilities}, " f"prerequisites={self.prerequisites}, " f"references={self.references})>" ) def __eq__(self, other: object) -> bool: """Check equality with another Bundle.""" if not isinstance(other, type(self)): return False if self.version != other.version: return False if self.capabilities != other.capabilities: return False if self.prerequisites != other.prerequisites: return False if self.references != other.references: return False if self.pack_data != other.pack_data: return False return True def close(self) -> None: """Close any open resources in this bundle.""" if self.pack_data is not None: self.pack_data.close() self.pack_data = None def __enter__(self) -> "Bundle": """Enter context manager.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: types.TracebackType | None, ) -> None: """Exit context manager and close bundle.""" self.close() def __del__(self) -> None: """Warn if bundle was not explicitly closed.""" if self.pack_data is not None: import warnings warnings.warn( f"Bundle {self!r} was not explicitly closed. " "Please use bundle.close() or a context manager.", ResourceWarning, stacklevel=2, ) def store_objects( self, object_store: "BaseObjectStore", progress: Callable[[str], None] | None = None, ) -> None: """Store all objects from this bundle into an object store. 
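Example (illustrative sketch; assumes a dulwich ``Repo`` in ``repo`` and a bundle file on disk whose name is made up):

    with open("changes.bundle", "rb") as f:
        bundle = read_bundle(f)
        bundle.store_objects(repo.object_store)
        bundle.close()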
Args: object_store: The object store to add objects to progress: Optional progress callback function """ from .objects import ShaFile if self.pack_data is None: raise ValueError("pack_data is not loaded") count = 0 for unpacked in self.pack_data.iter_unpacked(): # Convert the unpacked object to a proper git object if unpacked.decomp_chunks and unpacked.obj_type_num is not None: git_obj = ShaFile.from_raw_chunks( unpacked.obj_type_num, unpacked.decomp_chunks ) object_store.add_object(git_obj) count += 1 if progress and count % 100 == 0: progress(f"Stored {count} objects") if progress: progress(f"Stored {count} objects total") def _read_bundle(f: BinaryIO, version: int) -> Bundle: capabilities = {} prerequisites = [] references: dict[Ref, ObjectID] = {} line = f.readline() if version >= 3: while line.startswith(b"@"): line = line[1:].rstrip(b"\n") try: key, value_bytes = line.split(b"=", 1) value = value_bytes.decode("utf-8") except ValueError: key = line value = None capabilities[key.decode("utf-8")] = value line = f.readline() while line.startswith(b"-"): (obj_id, comment) = line[1:].rstrip(b"\n").split(b" ", 1) prerequisites.append((ObjectID(obj_id), comment)) line = f.readline() while line != b"\n": (obj_id, ref) = line.rstrip(b"\n").split(b" ", 1) references[Ref(ref)] = ObjectID(obj_id) line = f.readline() # Extract pack data to separate stream since PackData expects # the file to start with PACK header at position 0 pack_bytes = f.read() if not pack_bytes: raise ValueError("Bundle file contains no pack data") from io import BytesIO from .object_format import DEFAULT_OBJECT_FORMAT pack_file = BytesIO(pack_bytes) # TODO: Support specifying object format based on bundle metadata pack_data = PackData.from_file(pack_file, object_format=DEFAULT_OBJECT_FORMAT) ret = Bundle() ret.references = references ret.capabilities = capabilities ret.prerequisites = prerequisites ret.pack_data = pack_data ret.version = version return ret def read_bundle(f: BinaryIO) -> Bundle: """Read a bundle file. Args: f: A seekable binary file-like object. The file must remain open for the lifetime of the returned Bundle object. """ if not hasattr(f, "seek"): raise ValueError("Bundle file must be seekable") firstline = f.readline() if firstline == b"# v2 git bundle\n": return _read_bundle(f, 2) if firstline == b"# v3 git bundle\n": return _read_bundle(f, 3) raise AssertionError(f"unsupported bundle format header: {firstline!r}") def write_bundle(f: BinaryIO, bundle: Bundle) -> None: """Write a bundle to a file. 
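Example (illustrative sketch; ``repo`` is assumed, and the ref and file names are made up):

    bundle = create_bundle_from_repo(repo, refs=[b"refs/heads/master"])
    try:
        with open("master.bundle", "wb") as f:
            write_bundle(f, bundle)
    finally:
        bundle.close()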
Args: f: File-like object to write to bundle: Bundle object to write """ version = bundle.version if version is None: if bundle.capabilities: version = 3 else: version = 2 if version == 2: f.write(b"# v2 git bundle\n") elif version == 3: f.write(b"# v3 git bundle\n") else: raise AssertionError(f"unknown version {version}") if version == 3: for key, value in bundle.capabilities.items(): f.write(b"@" + key.encode("utf-8")) if value is not None: f.write(b"=" + value.encode("utf-8")) f.write(b"\n") for obj_id, comment in bundle.prerequisites: f.write(b"-" + obj_id + b" " + comment + b"\n") for ref, obj_id in bundle.references.items(): f.write(obj_id + b" " + ref + b"\n") f.write(b"\n") if bundle.pack_data is None: raise ValueError("bundle.pack_data is not loaded") write_pack_data( cast(Callable[[bytes], None], f.write), num_records=len(bundle.pack_data), records=bundle.pack_data.iter_unpacked(), object_format=bundle.pack_data.object_format, ) def create_bundle_from_repo( repo: "BaseRepo", refs: Sequence[Ref] | None = None, prerequisites: Sequence[bytes] | None = None, version: int | None = None, capabilities: dict[str, str | None] | None = None, progress: Callable[[str], None] | None = None, ) -> Bundle: """Create a bundle from a repository. Args: repo: Repository object to create bundle from refs: List of refs to include (defaults to all refs) prerequisites: List of commit SHAs that are prerequisites version: Bundle version (2 or 3, auto-detected if None) capabilities: Bundle capabilities (for v3 bundles) progress: Optional progress reporting function Returns: Bundle object ready for writing """ if refs is None: refs = list(repo.refs.keys()) if prerequisites is None: prerequisites = [] if capabilities is None: capabilities = {} # Build the references dictionary for the bundle bundle_refs: dict[Ref, ObjectID] = {} want_objects: set[ObjectID] = set() for ref in refs: if ref in repo.refs: ref_value = repo.refs[ref] # Handle peeled refs try: peeled_value = repo.refs.get_peeled(ref) if peeled_value is not None and peeled_value != ref_value: bundle_refs[ref] = peeled_value else: bundle_refs[ref] = ref_value except KeyError: bundle_refs[ref] = ref_value want_objects.add(bundle_refs[ref]) # Convert prerequisites to proper format bundle_prerequisites = [] have_objects: set[ObjectID] = set() for prereq in prerequisites: if not isinstance(prereq, bytes): raise TypeError( f"Invalid prerequisite type: {type(prereq)}, expected bytes" ) if len(prereq) != 40: raise ValueError( f"Invalid prerequisite SHA length: {len(prereq)}, expected 40 hex characters" ) try: # Validate it's actually hex bytes.fromhex(prereq.decode("utf-8")) except ValueError: raise ValueError(f"Invalid prerequisite format: {prereq!r}") # Store hex in bundle and for pack generation bundle_prerequisites.append((ObjectID(prereq), b"")) have_objects.add(ObjectID(prereq)) # Generate pack data containing all objects needed for the refs pack_count, pack_objects = repo.generate_pack_data( have=have_objects, want=want_objects, progress=progress, ) # Store the pack objects directly, we'll write them when saving the bundle # For now, create a simple wrapper to hold the data class _BundlePackData: def __init__( self, count: int, objects: Iterator[UnpackedObject], object_format: "ObjectFormat", ) -> None: self._count = count self._objects = list(objects) # Materialize the iterator self.object_format = object_format def __len__(self) -> int: return self._count def iter_unpacked(self) -> Iterator[UnpackedObject]: return iter(self._objects) def 
close(self) -> None: """Close pack data (no-op for in-memory pack data).""" pack_data = _BundlePackData(pack_count, pack_objects, repo.object_format) # Create bundle object bundle = Bundle() bundle.version = version bundle.capabilities = capabilities bundle.prerequisites = bundle_prerequisites bundle.references = bundle_refs bundle.pack_data = pack_data return bundle dulwich-1.0.0/dulwich/cli.py000077500000000000000000006772371513301442600157570ustar00rootroot00000000000000# # dulwich - Simple command-line interface to Dulwich # Copyright (C) 2008-2011 Jelmer Vernooij # vim: expandtab # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Simple command-line interface to Dulwich>. This is a very simple command-line wrapper for Dulwich. It is by no means intended to be a full-blown Git command-line interface but just a way to test Dulwich. """ __all__ = [ "AutoFlushBinaryIOWrapper", "AutoFlushTextIOWrapper", "Command", "CommitMessageError", "Pager", "PagerBuffer", "SuperCommand", "detect_terminal_width", "disable_pager", "enable_pager", "format_bytes", "format_columns", "get_pager", "launch_editor", "main", "parse_time_to_timestamp", "signal_int", "signal_quit", "to_display_str", "write_columns", ] # TODO: Add support for GIT_NAMESPACE environment variable by wrapping # repository refs with NamespacedRefsContainer when the environment # variable is set. See issue #1809 and dulwich.refs.NamespacedRefsContainer. import argparse import io import logging import os import shutil import signal import subprocess import sys import tempfile import types from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from pathlib import Path from types import TracebackType from typing import ( BinaryIO, ClassVar, TextIO, ) from dulwich import porcelain from dulwich._typing import Buffer from dulwich.refs import HEADREF, Ref from .bundle import Bundle, create_bundle_from_repo, read_bundle, write_bundle from .client import get_transport_and_path from .config import Config from .errors import ( ApplyDeltaError, FileFormatException, GitProtocolError, NotGitRepository, ) from .index import Index from .log_utils import _configure_logging_from_trace from .objects import Commit, ObjectID, RawObjectID, sha_to_hex, valid_hexsha from .objectspec import parse_commit_range from .pack import Pack from .patch import DiffAlgorithmNotAvailable from .repo import Repo logger = logging.getLogger(__name__) def to_display_str(value: bytes | str) -> str: """Convert a bytes or string value to a display string. 
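Example (illustrative sketch):

    to_display_str(b"refs/heads/main")   # 'refs/heads/main'
    to_display_str("already a str")      # 'already a str'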
Args: value: The value to convert (bytes or str) Returns: A string suitable for display """ if isinstance(value, bytes): return value.decode("utf-8", "replace") return value def _should_auto_flush( stream: TextIO | BinaryIO, env: Mapping[str, str] | None = None ) -> bool: """Determine if output should be auto-flushed based on GIT_FLUSH environment variable. Args: stream: The output stream to check env: Environment variables dict (defaults to os.environ) Returns: True if output should be flushed after each write, False otherwise """ if env is None: env = os.environ git_flush = env.get("GIT_FLUSH", "").strip() if git_flush == "1": return True elif git_flush == "0": return False else: # Auto-detect: don't flush if redirected to a file return hasattr(stream, "isatty") and not stream.isatty() class AutoFlushTextIOWrapper: """Wrapper that automatically flushes a TextIO stream based on configuration. This wrapper can be configured to flush after each write operation, which is useful for real-time output monitoring in CI/CD systems. """ def __init__(self, stream: TextIO) -> None: """Initialize the wrapper. Args: stream: The stream to wrap """ self._stream = stream @classmethod def env( cls, stream: TextIO, env: Mapping[str, str] | None = None ) -> "AutoFlushTextIOWrapper | TextIO": """Create wrapper respecting the GIT_FLUSH environment variable. Respects the GIT_FLUSH environment variable: - GIT_FLUSH=1: Always flush after each write - GIT_FLUSH=0: Never auto-flush (use buffered I/O) - Not set: Auto-detect based on whether output is redirected Args: stream: The stream to wrap env: Environment variables dict (defaults to os.environ) Returns: AutoFlushTextIOWrapper instance configured based on GIT_FLUSH """ if _should_auto_flush(stream, env): return cls(stream) else: return stream def write(self, data: str) -> int: """Write data to the stream and optionally flush. Args: data: Data to write Returns: Number of characters written """ result = self._stream.write(data) self._stream.flush() return result def writelines(self, lines: Iterable[str]) -> None: """Write multiple lines to the stream and optionally flush. Args: lines: Lines to write """ self._stream.writelines(lines) self._stream.flush() def flush(self) -> None: """Flush the underlying stream.""" self._stream.flush() def __getattr__(self, name: str) -> object: """Delegate all other attributes to the underlying stream.""" return getattr(self._stream, name) def __enter__(self) -> "AutoFlushTextIOWrapper": """Support context manager protocol.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: """Support context manager protocol.""" if hasattr(self._stream, "__exit__"): self._stream.__exit__(exc_type, exc_val, exc_tb) class AutoFlushBinaryIOWrapper: """Wrapper that automatically flushes a BinaryIO stream based on configuration. This wrapper can be configured to flush after each write operation, which is useful for real-time output monitoring in CI/CD systems. """ def __init__(self, stream: BinaryIO) -> None: """Initialize the wrapper. Args: stream: The stream to wrap """ self._stream = stream @classmethod def env( cls, stream: BinaryIO, env: Mapping[str, str] | None = None ) -> "AutoFlushBinaryIOWrapper | BinaryIO": """Create wrapper respecting the GIT_FLUSH environment variable. 
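# Illustrative sketch of the GIT_FLUSH handling above: with GIT_FLUSH=1 the
# stream is wrapped so every write() is flushed immediately, with GIT_FLUSH=0
# the original stream is returned untouched.
import io

from dulwich.cli import AutoFlushTextIOWrapper

stream = io.StringIO()
assert AutoFlushTextIOWrapper.env(stream, env={"GIT_FLUSH": "0"}) is stream
wrapped = AutoFlushTextIOWrapper.env(stream, env={"GIT_FLUSH": "1"})
assert isinstance(wrapped, AutoFlushTextIOWrapper)
wrapped.write("progress line\n")  # written and flushed in one call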
Respects the GIT_FLUSH environment variable: - GIT_FLUSH=1: Always flush after each write - GIT_FLUSH=0: Never auto-flush (use buffered I/O) - Not set: Auto-detect based on whether output is redirected Args: stream: The stream to wrap env: Environment variables dict (defaults to os.environ) Returns: AutoFlushBinaryIOWrapper instance configured based on GIT_FLUSH """ if _should_auto_flush(stream, env): return cls(stream) else: return stream def write(self, data: Buffer) -> int: """Write data to the stream and optionally flush. Args: data: Data to write Returns: Number of bytes written """ result = self._stream.write(data) self._stream.flush() return result def writelines(self, lines: Iterable[Buffer]) -> None: """Write multiple lines to the stream and optionally flush. Args: lines: Lines to write """ self._stream.writelines(lines) self._stream.flush() def flush(self) -> None: """Flush the underlying stream.""" self._stream.flush() def __getattr__(self, name: str) -> object: """Delegate all other attributes to the underlying stream.""" return getattr(self._stream, name) def __enter__(self) -> "AutoFlushBinaryIOWrapper": """Support context manager protocol.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: """Support context manager protocol.""" if hasattr(self._stream, "__exit__"): self._stream.__exit__(exc_type, exc_val, exc_tb) class CommitMessageError(Exception): """Raised when there's an issue with the commit message.""" def signal_int(signal: int, frame: types.FrameType | None) -> None: """Handle interrupt signal by exiting. Args: signal: Signal number frame: Current stack frame """ sys.exit(1) def signal_quit(signal: int, frame: types.FrameType | None) -> None: """Handle quit signal by entering debugger. Args: signal: Signal number frame: Current stack frame """ import pdb pdb.set_trace() def parse_time_to_timestamp(time_spec: str) -> int: """Parse a time specification and return a Unix timestamp. Args: time_spec: Time specification. Can be: - A Unix timestamp (integer as string) - A relative time like "2 weeks ago" - "now" for current time - "all" to expire all entries (returns future time) - "never" to never expire (returns 0 - epoch start) Returns: Unix timestamp Raises: ValueError: If the time specification cannot be parsed """ import time from .approxidate import parse_approxidate # Handle special cases specific to CLI if time_spec == "all": # Expire all entries - set to future time so everything is "older" return int(time.time()) + (100 * 365 * 24 * 60 * 60) # 100 years in future if time_spec == "never": # Never expire - set to epoch start so nothing is older return 0 # Use approxidate parser for everything else return parse_approxidate(time_spec) def format_bytes(bytes: float) -> str: """Format bytes as human-readable string. Args: bytes: Number of bytes Returns: Human-readable string like "1.5 MB" """ for unit in ["B", "KB", "MB", "GB"]: if bytes < 1024.0: return f"{bytes:.1f} {unit}" bytes /= 1024.0 return f"{bytes:.1f} TB" def launch_editor(template_content: bytes = b"") -> bytes: """Launch an editor for the user to enter text. 
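# Illustrative sketch of the helpers above: format_bytes() steps through
# B/KB/MB/GB in powers of 1024, and parse_time_to_timestamp() maps "never"
# to 0 and "all" to a timestamp far in the future.
from dulwich.cli import format_bytes, parse_time_to_timestamp

assert format_bytes(512) == "512.0 B"
assert format_bytes(1536) == "1.5 KB"
assert parse_time_to_timestamp("never") == 0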
Args: template_content: Initial content for the editor Returns: The edited content as bytes """ # Determine which editor to use editor = os.environ.get("GIT_EDITOR") or os.environ.get("EDITOR") or "vi" # Create a temporary file with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".txt") as f: temp_file = f.name f.write(template_content) try: # Launch the editor subprocess.run([editor, temp_file], check=True) # Read the edited content with open(temp_file, "rb") as f: content = f.read() return content finally: # Clean up the temporary file os.unlink(temp_file) def detect_terminal_width() -> int: """Detect the width of the terminal. Returns: Width of the terminal in characters, or 80 if it cannot be determined """ try: return os.get_terminal_size().columns except OSError: return 80 def write_columns( items: Iterator[bytes] | Sequence[bytes], out: TextIO, width: int | None = None, ) -> None: """Display items in formatted columns based on terminal width. Args: items: List or iterator of bytes objects to display in columns out: Output stream to write to width: Optional width of the terminal (if None, auto-detect) The function calculates the optimal number of columns to fit the terminal width and displays the items in a formatted column layout with proper padding and alignment. """ if width is None: ter_width = detect_terminal_width() else: ter_width = width item_names = [item.decode() for item in items] def columns( names: Sequence[str], width: int, num_cols: int ) -> tuple[bool, list[int]]: if num_cols <= 0: return False, [] num_rows = (len(names) + num_cols - 1) // num_cols col_widths = [] for col in range(num_cols): max_width = 0 for row in range(num_rows): idx = row + col * num_rows if idx < len(names): max_width = max(max_width, len(names[idx])) col_widths.append(max_width + 2) # add padding total_width = sum(col_widths) if total_width <= width: return True, col_widths return False, [] best_cols = 1 best_widths = [] for num_cols in range(min(8, len(item_names)), 0, -1): fits, widths = columns(item_names, ter_width, num_cols) if fits: best_cols = num_cols best_widths = widths break if not best_widths: best_cols = 1 best_widths = [max(len(name) for name in item_names) + 2] num_rows = (len(item_names) + best_cols - 1) // best_cols for row in range(num_rows): lines = [] for col in range(best_cols): idx = row + col * num_rows if idx < len(item_names): branch_name = item_names[idx] if col < len(best_widths): lines.append(branch_name.ljust(best_widths[col])) else: lines.append(branch_name) if lines: out.write("".join(lines).rstrip() + "\n") def format_columns( items: list[str], width: int | None = None, mode: str = "column", padding: int = 1, indent: str = "", nl: str = "\n", ) -> str: r"""Format items into columns with various layout modes. 
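# Illustrative sketch: laying out byte-string names with write_columns() above,
# passing an explicit width instead of auto-detecting the terminal size.
import io

from dulwich.cli import write_columns

out = io.StringIO()
write_columns([b"main", b"develop", b"feature/x"], out, width=40)
print(out.getvalue(), end="")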
Args: items: List of strings to format width: Terminal width (auto-detected if None) mode: Layout mode - "column" (fill columns first), "row" (fill rows first), "plain" (one column), or add ",dense" for unequal column widths padding: Number of spaces between columns indent: String to prepend to each line nl: String to append to each line (including newline) Returns: Formatted string with items in columns Examples: >>> format_columns(["a", "b", "c"], width=20, mode="column") "a b\\nc\\n" >>> format_columns(["a", "b", "c"], width=20, mode="row") "a b c\\n" """ if not items: return "" if width is None: width = detect_terminal_width() # Parse mode mode_parts = mode.split(",") layout_mode = "column" dense = False for part in mode_parts: part = part.strip() if part in ("column", "row", "plain"): layout_mode = part elif part == "dense": dense = True elif part == "nodense": dense = False # Plain mode - one item per line if layout_mode == "plain": return "".join(indent + item + nl for item in items) # Calculate available width for content (excluding indent) available_width = width - len(indent) if available_width <= 0: available_width = width # Find optimal number of columns max_item_len = max(len(item) for item in items) # Start with maximum possible columns and work down best_num_cols = 1 best_col_widths: list[int] = [] for num_cols in range(min(len(items), 20), 0, -1): if layout_mode == "column": # Column mode: fill columns first (items go down, then across) num_rows = (len(items) + num_cols - 1) // num_cols else: # row mode # Row mode: fill rows first (items go across, then down) num_rows = (len(items) + num_cols - 1) // num_cols col_widths: list[int] = [] if dense: # Calculate width for each column based on its contents for col in range(num_cols): max_width = 0 for row in range(num_rows): if layout_mode == "column": idx = row + col * num_rows else: # row mode idx = row * num_cols + col if idx < len(items): max_width = max(max_width, len(items[idx])) if max_width > 0: col_widths.append(max_width) else: # All columns same width (nodense) max_width = 0 for col in range(num_cols): for row in range(num_rows): if layout_mode == "column": idx = row + col * num_rows else: # row mode idx = row * num_cols + col if idx < len(items): max_width = max(max_width, len(items[idx])) col_widths = [max_width] * num_cols # Calculate total width including padding (but not after last column) total_width = sum(col_widths) + padding * (len(col_widths) - 1) if total_width <= available_width: best_num_cols = num_cols best_col_widths = col_widths break # If no fit found, use single column if not best_col_widths: best_num_cols = 1 best_col_widths = [max_item_len] # Format output num_rows = (len(items) + best_num_cols - 1) // best_num_cols lines = [] for row in range(num_rows): line_parts = [] for col in range(best_num_cols): if layout_mode == "column": idx = row + col * num_rows else: # row mode idx = row * best_num_cols + col if idx < len(items): item = items[idx] # Pad item to column width, except for last column in row if col < best_num_cols - 1 and col < len(best_col_widths) - 1: item = item.ljust(best_col_widths[col] + padding) line_parts.append(item) if line_parts: lines.append(indent + "".join(line_parts).rstrip() + nl) return "".join(lines) class PagerBuffer(BinaryIO): """Binary buffer wrapper for Pager to mimic sys.stdout.buffer.""" def __init__(self, pager: "Pager") -> None: """Initialize PagerBuffer. 
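# Illustrative sketch of the "plain" layout mode above: one item per line,
# honouring the indent and nl arguments.
from dulwich.cli import format_columns

assert format_columns(["a", "b"], mode="plain", indent="  ") == "  a\n  b\n"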
Args: pager: Pager instance to wrap """ self.pager = pager def write(self, data: bytes | bytearray | memoryview) -> int: # type: ignore[override] """Write bytes to pager.""" # Convert to bytes and decode to string for the pager text = bytes(data).decode("utf-8", errors="replace") return self.pager.write(text) def flush(self) -> None: """Flush the pager.""" return self.pager.flush() def writelines(self, lines: Iterable[bytes | bytearray | memoryview]) -> None: # type: ignore[override] """Write multiple lines to pager.""" for line in lines: self.write(line) def readable(self) -> bool: """Return whether the buffer is readable (it's not).""" return False def writable(self) -> bool: """Return whether the buffer is writable.""" return not self.pager._closed def seekable(self) -> bool: """Return whether the buffer is seekable (it's not).""" return False def close(self) -> None: """Close the pager.""" return self.pager.close() @property def closed(self) -> bool: """Return whether the buffer is closed.""" return self.pager.closed @property def mode(self) -> str: """Return the mode.""" return "wb" @property def name(self) -> str: """Return the name.""" return "" def fileno(self) -> int: """Return the file descriptor (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support fileno()") def isatty(self) -> bool: """Return whether the buffer is a TTY.""" return False def read(self, size: int = -1) -> bytes: """Read from the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def read1(self, size: int = -1) -> bytes: """Read from the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def readinto(self, b: bytearray) -> int: """Read into buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def readinto1(self, b: bytearray) -> int: """Read into buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def readline(self, size: int = -1) -> bytes: """Read a line from the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def readlines(self, hint: int = -1) -> list[bytes]: """Read lines from the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support reading") def seek(self, offset: int, whence: int = 0) -> int: """Seek in the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support seeking") def tell(self) -> int: """Return the current position (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support tell()") def truncate(self, size: int | None = None) -> int: """Truncate the buffer (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support truncation") def __iter__(self) -> "PagerBuffer": """Return iterator (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support iteration") def __next__(self) -> bytes: """Return next line (not supported).""" raise io.UnsupportedOperation("PagerBuffer does not support iteration") def __enter__(self) -> "PagerBuffer": """Enter context manager.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: """Exit context manager.""" self.close() class Pager(TextIO): """File-like object that pages output through external pager programs.""" def __init__(self, pager_cmd: str = "cat") -> None: """Initialize Pager. 
Args: pager_cmd: Command to use for paging (default: "cat") """ self.pager_process: subprocess.Popen[str] | None = None self._buffer = PagerBuffer(self) self._closed = False self.pager_cmd = pager_cmd self._pager_died = False def _get_pager_command(self) -> str: """Get the pager command to use.""" return self.pager_cmd def _ensure_pager_started(self) -> None: """Start the pager process if not already started.""" if self.pager_process is None and not self._closed: try: pager_cmd = self._get_pager_command() self.pager_process = subprocess.Popen( pager_cmd, shell=True, stdin=subprocess.PIPE, stdout=sys.stdout, stderr=sys.stderr, text=True, ) except (OSError, subprocess.SubprocessError): # Pager failed to start, fall back to direct output self.pager_process = None def write(self, text: str) -> int: """Write text to the pager.""" if self._closed: raise ValueError("I/O operation on closed file") # If pager died (user quit), stop writing output if self._pager_died: return len(text) self._ensure_pager_started() if self.pager_process and self.pager_process.stdin: try: result = self.pager_process.stdin.write(text) assert isinstance(result, int) return result except (OSError, subprocess.SubprocessError, BrokenPipeError): # Pager died (user quit), stop writing output self._pager_died = True return len(text) else: # No pager available, write directly to stdout return sys.stdout.write(text) def flush(self) -> None: """Flush the pager.""" if self._closed or self._pager_died: return if self.pager_process and self.pager_process.stdin: try: self.pager_process.stdin.flush() except (OSError, subprocess.SubprocessError, BrokenPipeError): self._pager_died = True else: sys.stdout.flush() def close(self) -> None: """Close the pager.""" if self._closed: return self._closed = True if self.pager_process: try: if self.pager_process.stdin: self.pager_process.stdin.close() self.pager_process.wait() except (OSError, subprocess.SubprocessError): pass self.pager_process = None def __enter__(self) -> "Pager": """Context manager entry.""" return self def __exit__( self, exc_type: type | None, exc_val: BaseException | None, exc_tb: types.TracebackType | None, ) -> None: """Context manager exit.""" self.close() # Additional file-like methods for compatibility def writelines(self, lines: Iterable[str]) -> None: """Write a list of lines to the pager.""" if self._pager_died: return for line in lines: self.write(line) @property def closed(self) -> bool: """Return whether the pager is closed.""" return self._closed def readable(self) -> bool: """Return whether the pager is readable (it's not).""" return False def writable(self) -> bool: """Return whether the pager is writable.""" return not self._closed def seekable(self) -> bool: """Return whether the pager is seekable (it's not).""" return False @property def buffer(self) -> BinaryIO: """Return the underlying binary buffer.""" return self._buffer @property def encoding(self) -> str: """Return the encoding used.""" return "utf-8" @property def errors(self) -> str | None: """Return the error handling scheme.""" return "replace" def fileno(self) -> int: """Return the file descriptor (not supported).""" raise io.UnsupportedOperation("Pager does not support fileno()") def isatty(self) -> bool: """Return whether the pager is a TTY.""" return False @property def line_buffering(self) -> bool: """Return whether line buffering is enabled.""" return True @property def mode(self) -> str: """Return the mode.""" return "w" @property def name(self) -> str: """Return the name.""" return "" 
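# Illustrative sketch: driving the Pager above directly.  "cat" keeps the
# example non-interactive; if the pager process cannot be started, output
# falls back to plain stdout.
from dulwich.cli import Pager

with Pager("cat") as pager:
    pager.write("hello from the pager\n")
    pager.buffer.write(b"bytes are decoded before being paged\n")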
@property def newlines(self) -> str | tuple[str, ...] | None: """Return the newlines mode.""" return None def read(self, size: int = -1) -> str: """Read from the pager (not supported).""" raise io.UnsupportedOperation("Pager does not support reading") def readline(self, size: int = -1) -> str: """Read a line from the pager (not supported).""" raise io.UnsupportedOperation("Pager does not support reading") def readlines(self, hint: int = -1) -> list[str]: """Read lines from the pager (not supported).""" raise io.UnsupportedOperation("Pager does not support reading") def seek(self, offset: int, whence: int = 0) -> int: """Seek in the pager (not supported).""" raise io.UnsupportedOperation("Pager does not support seeking") def tell(self) -> int: """Return the current position (not supported).""" raise io.UnsupportedOperation("Pager does not support tell()") def truncate(self, size: int | None = None) -> int: """Truncate the pager (not supported).""" raise io.UnsupportedOperation("Pager does not support truncation") def __iter__(self) -> "Pager": """Return iterator (not supported).""" raise io.UnsupportedOperation("Pager does not support iteration") def __next__(self) -> str: """Return next line (not supported).""" raise io.UnsupportedOperation("Pager does not support iteration") class _StreamContextAdapter: """Adapter to make streams work with context manager protocol.""" def __init__(self, stream: TextIO | BinaryIO) -> None: self.stream = stream # Expose buffer if it exists if hasattr(stream, "buffer"): self.buffer = stream.buffer else: self.buffer = stream def __enter__(self) -> TextIO: # We only use this with sys.stdout which is TextIO return self.stream # type: ignore[return-value] def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: # For stdout/stderr, we don't close them pass def __getattr__(self, name: str) -> object: return getattr(self.stream, name) def get_pager( config: Config | None = None, cmd_name: str | None = None ) -> "_StreamContextAdapter | Pager": """Get a pager instance if paging should be used, otherwise return sys.stdout. Args: config: Optional config instance (e.g., StackedConfig) to read settings from cmd_name: Optional command name for per-command pager settings Returns: Either a wrapped sys.stdout or a Pager instance (both context managers) """ # Check global pager disable flag if getattr(get_pager, "_disabled", False): return _StreamContextAdapter(sys.stdout) # Don't page if stdout is not a terminal if not sys.stdout.isatty(): return _StreamContextAdapter(sys.stdout) # Priority order for pager command (following git's behavior): # 1. Check pager. config (if cmd_name provided) # 2. Check environment variables: DULWICH_PAGER, GIT_PAGER, PAGER # 3. Check core.pager config # 4. Fallback to common pagers pager_cmd = None # 1. Check per-command pager config (pager.) if config and cmd_name: try: pager_value = config.get( ("pager",), cmd_name.encode() if isinstance(cmd_name, str) else cmd_name ) except KeyError: pass else: if pager_value == b"false": return _StreamContextAdapter(sys.stdout) elif pager_value != b"true": # It's a custom pager command pager_cmd = ( pager_value.decode() if isinstance(pager_value, bytes) else pager_value ) # 2. Check environment variables if not pager_cmd: for env_var in ["DULWICH_PAGER", "GIT_PAGER", "PAGER"]: pager = os.environ.get(env_var) if pager: if pager == "false": return _StreamContextAdapter(sys.stdout) pager_cmd = pager break # 3. 
Check core.pager config if not pager_cmd and config: try: core_pager = config.get(("core",), b"pager") except KeyError: pass else: if core_pager == b"false" or core_pager == b"": return _StreamContextAdapter(sys.stdout) pager_cmd = ( core_pager.decode() if isinstance(core_pager, bytes) else core_pager ) # 4. Fallback to common pagers if not pager_cmd: for pager in ["less", "more", "cat"]: if shutil.which(pager): if pager == "less": pager_cmd = "less -FRX" # -F: quit if one screen, -R: raw control chars, -X: no init/deinit else: pager_cmd = pager break else: pager_cmd = "cat" # Ultimate fallback return Pager(pager_cmd) def disable_pager() -> None: """Disable pager for this session.""" get_pager._disabled = True # type: ignore[attr-defined] def enable_pager() -> None: """Enable pager for this session.""" get_pager._disabled = False # type: ignore[attr-defined] class Command: """A Dulwich subcommand.""" def run(self, args: Sequence[str]) -> int | None: """Run the command.""" raise NotImplementedError(self.run) class cmd_archive(Command): """Create an archive of files from a named tree.""" def run(self, args: Sequence[str]) -> None: """Execute the archive command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--remote", type=str, help="Retrieve archive from specified remote repo", ) parser.add_argument("committish", type=str, nargs="?") parsed_args = parser.parse_args(args) if parsed_args.remote: client, path = get_transport_and_path(parsed_args.remote) def stdout_write(data: bytes) -> None: sys.stdout.buffer.write(data) def stderr_write(data: bytes) -> None: sys.stderr.buffer.write(data) client.archive( path.encode("utf-8") if isinstance(path, str) else path, parsed_args.committish.encode("utf-8") if isinstance(parsed_args.committish, str) else parsed_args.committish, stdout_write, write_error=stderr_write, ) else: # Use binary buffer for archive output outstream: BinaryIO = sys.stdout.buffer errstream: BinaryIO = sys.stderr.buffer porcelain.archive( ".", parsed_args.committish, outstream=outstream, errstream=errstream, ) class cmd_add(Command): """Add file contents to the index.""" def run(self, argv: Sequence[str]) -> None: """Execute the add command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("path", nargs="+") args = parser.parse_args(argv) # Convert '.' to None to add all files paths = args.path if len(paths) == 1 and paths[0] == ".": paths = None porcelain.add(".", paths=paths) class cmd_annotate(Command): """Annotate each line in a file with commit information.""" def run(self, argv: Sequence[str]) -> None: """Execute the annotate command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("path", help="Path to file to annotate") parser.add_argument("committish", nargs="?", help="Commit to start from") args = parser.parse_args(argv) with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="annotate") as outstream: results = porcelain.annotate(repo, args.path, args.committish) for (commit, entry), line in results: # Show shortened commit hash and line content commit_hash = commit.id[:8] outstream.write(f"{commit_hash.decode()} {line.decode()}\n") class cmd_blame(Command): """Show what revision and author last modified each line of a file.""" def run(self, argv: Sequence[str]) -> None: """Execute the blame command. 
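# Illustrative sketch: the calling pattern used by the commands above.
# get_pager() returns a context manager, honours pager.<cmd> and core.pager
# configuration, and falls back to DULWICH_PAGER/GIT_PAGER/PAGER.
from dulwich.cli import get_pager
from dulwich.repo import Repo

with Repo(".") as repo:
    config = repo.get_config_stack()
    with get_pager(config=config, cmd_name="log") as outstream:
        outstream.write("one page of output\n")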
Args: argv: Command line arguments """ # blame is an alias for annotate cmd_annotate().run(argv) class cmd_rm(Command): """Remove files from the working tree and from the index.""" def run(self, argv: Sequence[str]) -> None: """Execute the rm command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--cached", action="store_true", help="Remove from index only" ) parser.add_argument("path", type=Path, nargs="+") args = parser.parse_args(argv) porcelain.remove(".", paths=args.path, cached=args.cached) class cmd_mv(Command): """Move or rename a file, a directory, or a symlink.""" def run(self, argv: Sequence[str]) -> None: """Execute the mv command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "-f", "--force", action="store_true", help="Force move even if destination exists", ) parser.add_argument("source", type=Path) parser.add_argument("destination", type=Path) args = parser.parse_args(argv) porcelain.mv(".", args.source, args.destination, force=args.force) class cmd_fetch_pack(Command): """Receive missing objects from another repository.""" def run(self, argv: Sequence[str]) -> None: """Execute the fetch-pack command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("--all", action="store_true") parser.add_argument("location", nargs="?", type=str) parser.add_argument("refs", nargs="*", type=str) args = parser.parse_args(argv) client, path = get_transport_and_path(args.location) r = Repo(".") if args.all: determine_wants = r.object_store.determine_wants_all else: def determine_wants( refs: Mapping[Ref, ObjectID], depth: int | None = None ) -> list[ObjectID]: return [ ObjectID(y.encode("utf-8")) for y in args.refs if y not in r.object_store ] client.fetch(path.encode("utf-8"), r, determine_wants) class cmd_fetch(Command): """Download objects and refs from another repository.""" def run(self, args: Sequence[str]) -> None: """Execute the fetch command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() # Mutually exclusive group for location vs --all location_group = parser.add_mutually_exclusive_group(required=True) location_group.add_argument( "location", nargs="?", default=None, help="Remote location to fetch from" ) location_group.add_argument( "--all", action="store_true", help="Fetch all remotes" ) # Mutually exclusive group for tag handling tag_group = parser.add_mutually_exclusive_group() tag_group.add_argument( "--tags", action="store_true", help="Fetch all tags from remote" ) tag_group.add_argument( "--no-tags", action="store_true", help="Don't fetch any tags from remote" ) parser.add_argument( "--depth", type=int, help="Create a shallow clone with a history truncated to the specified number of commits", ) parser.add_argument( "--shallow-since", type=str, help="Deepen or shorten the history of a shallow repository to include all reachable commits after ", ) parser.add_argument( "--shallow-exclude", type=str, action="append", help="Deepen or shorten the history of a shallow repository to exclude commits reachable from a specified remote branch or tag", ) parsed_args = parser.parse_args(args) r = Repo(".") def progress(msg: bytes) -> None: sys.stdout.buffer.write(msg) # Determine include_tags setting include_tags = False if parsed_args.tags: include_tags = True elif not parsed_args.no_tags: # Default behavior - don't force tag inclusion include_tags = False if parsed_args.all: # Fetch from all remotes config = r.get_config() remotes = set() for section in config.sections(): if len(section) == 2 and section[0] == b"remote": remotes.add(section[1].decode()) if not remotes: logger.warning("No remotes configured") return for remote_name in sorted(remotes): logger.info("Fetching %s", remote_name) porcelain.fetch( r, remote_location=remote_name, depth=parsed_args.depth, include_tags=include_tags, shallow_since=parsed_args.shallow_since, shallow_exclude=parsed_args.shallow_exclude, ) else: # Fetch from specific location porcelain.fetch( r, remote_location=parsed_args.location, depth=parsed_args.depth, include_tags=include_tags, shallow_since=parsed_args.shallow_since, shallow_exclude=parsed_args.shallow_exclude, ) class cmd_for_each_ref(Command): """Output information on each ref.""" def run(self, args: Sequence[str]) -> None: """Execute the for-each-ref command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("pattern", type=str, nargs="?") parsed_args = parser.parse_args(args) for sha, object_type, ref in porcelain.for_each_ref(".", parsed_args.pattern): logger.info("%s %s\t%s", sha.decode(), object_type.decode(), ref.decode()) class cmd_fsck(Command): """Verify the connectivity and validity of objects in the database.""" def run(self, args: Sequence[str]) -> None: """Execute the fsck command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) for obj, msg in porcelain.fsck("."): logger.info("%s: %s", obj.decode() if isinstance(obj, bytes) else obj, msg) class cmd_log(Command): """Show commit logs.""" def run(self, args: Sequence[str]) -> None: """Execute the log command. 
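# Illustrative sketch: the porcelain call the fetch command above reduces to
# for a single remote (assumption: a remote named "origin" is configured).
from dulwich import porcelain
from dulwich.repo import Repo

with Repo(".") as r:
    porcelain.fetch(r, remote_location="origin", depth=1, include_tags=True)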
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--reverse", action="store_true", help="Reverse order in which entries are printed", ) parser.add_argument( "--name-status", action="store_true", help="Print name/status for each changed file", ) parser.add_argument("paths", nargs="*", help="Paths to show log for") parsed_args = parser.parse_args(args) with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="log") as outstream: porcelain.log( repo, paths=parsed_args.paths, reverse=parsed_args.reverse, name_status=parsed_args.name_status, outstream=outstream, ) class cmd_diff(Command): """Show changes between commits, commit and working tree, etc.""" def run(self, args: Sequence[str]) -> None: """Execute the diff command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "committish", nargs="*", default=[], help="Commits or refs to compare" ) parser.add_argument("--staged", action="store_true", help="Show staged changes") parser.add_argument( "--cached", action="store_true", help="Show staged changes (same as --staged)", ) parser.add_argument( "--color", choices=["always", "never", "auto"], default="auto", help="Use colored output (requires rich)", ) parser.add_argument( "--patience", action="store_true", help="Use patience diff algorithm", ) parser.add_argument( "--diff-algorithm", choices=["myers", "patience"], default="myers", help="Choose a diff algorithm", ) parser.add_argument( "--stat", action="store_true", help="Show diffstat instead of full diff", ) parser.add_argument( "--", dest="separator", action="store_true", help=argparse.SUPPRESS ) parser.add_argument("paths", nargs="*", default=[], help="Paths to limit diff") # Handle the -- separator for paths if "--" in args: sep_index = args.index("--") parsed_args = parser.parse_args(args[:sep_index]) parsed_args.paths = args[sep_index + 1 :] else: parsed_args = parser.parse_args(args) # Determine diff algorithm diff_algorithm = parsed_args.diff_algorithm if parsed_args.patience: diff_algorithm = "patience" # Determine if we should use color def _should_use_color() -> bool: if parsed_args.color == "always": return True elif parsed_args.color == "never": return False else: # auto return sys.stdout.isatty() def _create_output_stream(outstream: TextIO) -> BinaryIO: """Create output stream, optionally with colorization.""" if not _should_use_color(): return outstream.buffer from .diff import ColorizedDiffStream if not ColorizedDiffStream.is_available(): if parsed_args.color == "always": raise ImportError( "Rich is required for colored output. Install with: pip install 'dulwich[colordiff]'" ) else: logging.warning( "Rich not available, disabling colored output. 
Install with: pip install 'dulwich[colordiff]'" ) return outstream.buffer return ColorizedDiffStream(outstream.buffer) with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="diff") as outstream: # For --stat mode, capture the diff in a BytesIO buffer if parsed_args.stat: import io from .diffstat import diffstat diff_buffer: BinaryIO = io.BytesIO() output_stream: BinaryIO = diff_buffer else: output_stream = _create_output_stream(outstream) try: if len(parsed_args.committish) == 0: # Show diff for working tree or staged changes porcelain.diff( repo, staged=(parsed_args.staged or parsed_args.cached), paths=parsed_args.paths or None, outstream=output_stream, diff_algorithm=diff_algorithm, ) elif len(parsed_args.committish) == 1: # Show diff between working tree and specified commit if parsed_args.staged or parsed_args.cached: parser.error( "--staged/--cached cannot be used with commits" ) porcelain.diff( repo, commit=parsed_args.committish[0], staged=False, paths=parsed_args.paths or None, outstream=output_stream, diff_algorithm=diff_algorithm, ) elif len(parsed_args.committish) == 2: # Show diff between two commits porcelain.diff( repo, commit=parsed_args.committish[0], commit2=parsed_args.committish[1], paths=parsed_args.paths or None, outstream=output_stream, diff_algorithm=diff_algorithm, ) else: parser.error("Too many arguments - specify at most two commits") except DiffAlgorithmNotAvailable as e: sys.stderr.write(f"fatal: {e}\n") sys.exit(1) if parsed_args.stat: # Generate and output diffstat from captured diff assert isinstance(diff_buffer, io.BytesIO) diff_data = diff_buffer.getvalue() lines = diff_data.split(b"\n") stat_output = diffstat(lines) outstream.buffer.write(stat_output + b"\n") else: # Flush any remaining output if hasattr(output_stream, "flush"): output_stream.flush() class cmd_dump_pack(Command): """Dump the contents of a pack file for debugging.""" def run(self, args: Sequence[str]) -> None: """Execute the dump-pack command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("filename", help="Pack file to dump") parser.add_argument( "--object-format", choices=["sha1", "sha256"], default="sha1", help="Object format (hash algorithm) used in the pack file", ) parsed_args = parser.parse_args(args) from .object_format import OBJECT_FORMATS object_format = OBJECT_FORMATS[parsed_args.object_format] basename, _ = os.path.splitext(parsed_args.filename) x = Pack(basename, object_format=object_format) logger.info("Object names checksum: %s", x.name().decode("ascii", "replace")) logger.info("Checksum: %r", sha_to_hex(RawObjectID(x.get_stored_checksum()))) x.check() logger.info("Length: %d", len(x)) for name in x: try: logger.info("\t%s", x[name]) except KeyError as k: logger.error( "\t%s: Unable to resolve base %r", name.decode("ascii", "replace"), k, ) except ApplyDeltaError as e: logger.error( "\t%s: Unable to apply delta: %r", name.decode("ascii", "replace"), e, ) class cmd_dump_index(Command): """Show information about a pack index file.""" def run(self, args: Sequence[str]) -> None: """Execute the dump-index command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("filename", help="Index file to dump") parsed_args = parser.parse_args(args) idx = Index(parsed_args.filename) for o in idx: logger.info("%s %s", o, idx[o]) class cmd_interpret_trailers(Command): """Add or parse structured information in commit messages.""" def run(self, args: Sequence[str]) -> None: """Execute the interpret-trailers command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "file", nargs="?", help="File to read message from. If not specified, reads from stdin.", ) parser.add_argument( "--trailer", action="append", dest="trailers", metavar="[(=|:)]", help="Trailer to add. Can be specified multiple times.", ) parser.add_argument( "--trim-empty", action="store_true", help="Remove trailers with empty values", ) parser.add_argument( "--only-trailers", action="store_true", help="Output only the trailers, not the message body", ) parser.add_argument( "--only-input", action="store_true", help="Don't add new trailers, only parse existing ones", ) parser.add_argument( "--unfold", action="store_true", help="Join multiline values into one line" ) parser.add_argument( "--parse", action="store_true", help="Shorthand for --only-trailers --only-input --unfold", ) parser.add_argument( "--where", choices=["end", "start", "after", "before"], default="end", help="Where to place new trailers", ) parser.add_argument( "--if-exists", choices=[ "add", "replace", "addIfDifferent", "addIfDifferentNeighbor", "doNothing", ], default="addIfDifferentNeighbor", help="Action if trailer already exists", ) parser.add_argument( "--if-missing", choices=["add", "doNothing"], default="add", help="Action if trailer is missing", ) parsed_args = parser.parse_args(args) # Read message from file or stdin if parsed_args.file: with open(parsed_args.file, "rb") as f: message = f.read() else: message = sys.stdin.buffer.read() # Parse trailer arguments trailer_list = [] if parsed_args.trailers: for trailer_spec in parsed_args.trailers: # Parse "key:value" or "key=value" or just "key" if ":" in trailer_spec: key, value = trailer_spec.split(":", 1) elif "=" in trailer_spec: key, value = trailer_spec.split("=", 1) else: key = trailer_spec value = "" trailer_list.append((key.strip(), value.strip())) # Call interpret_trailers result = porcelain.interpret_trailers( message, trailers=trailer_list if trailer_list else None, trim_empty=parsed_args.trim_empty, only_trailers=parsed_args.only_trailers, only_input=parsed_args.only_input, unfold=parsed_args.unfold, parse=parsed_args.parse, where=parsed_args.where, if_exists=parsed_args.if_exists, if_missing=parsed_args.if_missing, ) # Output result sys.stdout.buffer.write(result) class cmd_stripspace(Command): """Remove unnecessary whitespace from text.""" def run(self, args: Sequence[str]) -> None: """Execute the stripspace command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "file", nargs="?", help="File to read text from. 
If not specified, reads from stdin.", ) parser.add_argument( "-s", "--strip-comments", action="store_true", help="Strip lines that begin with comment character", ) parser.add_argument( "-c", "--comment-lines", action="store_true", help="Prepend comment character to each line", ) parser.add_argument( "--comment-char", default="#", help="Comment character to use (default: #)", ) parsed_args = parser.parse_args(args) # Read text from file or stdin if parsed_args.file: with open(parsed_args.file, "rb") as f: text = f.read() else: text = sys.stdin.buffer.read() # Call stripspace result = porcelain.stripspace( text, strip_comments=parsed_args.strip_comments, comment_char=parsed_args.comment_char, comment_lines=parsed_args.comment_lines, ) # Output result sys.stdout.buffer.write(result) class cmd_column(Command): """Display data in columns.""" def run(self, args: Sequence[str]) -> None: """Execute the column command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Format input data into columns for better readability" ) parser.add_argument( "--mode", default="column", help=( "Layout mode: 'column' (fill columns first), 'row' (fill rows first), " "'plain' (one column). Add ',dense' for unequal column widths, " "',nodense' for equal widths (default: column)" ), ) parser.add_argument( "--width", type=int, help="Terminal width (default: auto-detect)", ) parser.add_argument( "--indent", default="", help="String to prepend to each line (default: empty)", ) parser.add_argument( "--nl", default="\n", help="String to append to each line, including newline (default: \\n)", ) parser.add_argument( "--padding", type=int, default=1, help="Number of spaces between columns (default: 1)", ) parsed_args = parser.parse_args(args) # Read lines from stdin lines = [] for line in sys.stdin: # Strip the newline but keep the content lines.append(line.rstrip("\n\r")) # Format and output result = format_columns( lines, width=parsed_args.width, mode=parsed_args.mode, padding=parsed_args.padding, indent=parsed_args.indent, nl=parsed_args.nl, ) sys.stdout.write(result) class cmd_init(Command): """Create an empty Git repository or reinitialize an existing one.""" def run(self, args: Sequence[str]) -> None: """Execute the init command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--bare", action="store_true", help="Create a bare repository" ) parser.add_argument( "--objectformat", type=str, choices=["sha1", "sha256"], help="Object format to use (sha1 or sha256)", ) parser.add_argument( "path", nargs="?", default=os.getcwd(), help="Repository path" ) parsed_args = parser.parse_args(args) porcelain.init( parsed_args.path, bare=parsed_args.bare, object_format=parsed_args.objectformat, ) class cmd_clone(Command): """Clone a repository into a new directory.""" def run(self, args: Sequence[str]) -> None: """Execute the clone command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--bare", help="Whether to create a bare repository.", action="store_true", ) parser.add_argument("--depth", type=int, help="Depth at which to fetch") parser.add_argument( "-b", "--branch", type=str, help="Check out branch instead of branch pointed to by remote HEAD", ) parser.add_argument( "--refspec", type=str, help="References to fetch", action="append", ) parser.add_argument( "--filter", dest="filter_spec", type=str, help="git-rev-list-style object filter", ) parser.add_argument( "--protocol", type=int, help="Git protocol version to use", ) parser.add_argument( "--recurse-submodules", action="store_true", help="Initialize and clone submodules", ) parser.add_argument("source", help="Repository to clone from") parser.add_argument("target", nargs="?", help="Directory to clone into") parsed_args = parser.parse_args(args) try: porcelain.clone( parsed_args.source, parsed_args.target, bare=parsed_args.bare, depth=parsed_args.depth, branch=parsed_args.branch, refspec=parsed_args.refspec, filter_spec=parsed_args.filter_spec, protocol_version=parsed_args.protocol, recurse_submodules=parsed_args.recurse_submodules, ) except GitProtocolError as e: logging.exception(e) def _get_commit_message_with_template( initial_message: bytes | None, repo: Repo | None = None, commit: Commit | None = None, ) -> bytes: """Get commit message with an initial message template.""" # Start with the initial message template = initial_message or b"" if template and not template.endswith(b"\n"): template += b"\n" template += b"\n" template += b"# Please enter the commit message for your changes. Lines starting\n" template += b"# with '#' will be ignored, and an empty message aborts the commit.\n" template += b"#\n" # Add branch info if repo is provided if repo: try: ref_names, _ref_sha = repo.refs.follow(HEADREF) ref_path = ref_names[-1] # Get the final reference if ref_path.startswith(b"refs/heads/"): branch = ref_path[11:] # Remove 'refs/heads/' prefix else: branch = ref_path template += b"# On branch %s\n" % branch except (KeyError, IndexError): template += b"# On branch (unknown)\n" template += b"#\n" template += b"# Changes to be committed:\n" # Launch editor content = launch_editor(template) # Remove comment lines and strip lines = content.split(b"\n") message_lines = [line for line in lines if not line.strip().startswith(b"#")] message = b"\n".join(message_lines).strip() if not message: raise CommitMessageError("Aborting commit due to empty commit message") return message class cmd_config(Command): """Get and set repository or global options.""" def run(self, args: Sequence[str]) -> int | None: """Execute the config command. 
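# Illustrative sketch: the porcelain call behind the clone command above; the
# source URL and target directory are example values.
from dulwich import porcelain

porcelain.clone(
    "https://example.com/repo.git",
    "repo-copy",
    depth=1,
    branch="main",
)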
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--global", dest="global_config", action="store_true", help="Use global config file", ) parser.add_argument( "--local", action="store_true", help="Use repository config file (default)", ) parser.add_argument( "-l", "--list", action="store_true", help="List all variables", ) parser.add_argument( "--unset", action="store_true", help="Remove a variable", ) parser.add_argument( "--unset-all", action="store_true", help="Remove all matches for a variable", ) parser.add_argument( "--get-all", action="store_true", help="Get all values for a multivar", ) parser.add_argument( "key", nargs="?", help="Config key (e.g., user.name)", ) parser.add_argument( "value", nargs="?", help="Config value to set", ) parsed_args = parser.parse_args(args) # Determine which config file to use if parsed_args.global_config: # Use global config file config_path = os.path.expanduser("~/.gitconfig") try: from .config import ConfigFile config = ConfigFile.from_path(config_path) except FileNotFoundError: from .config import ConfigFile config = ConfigFile() config.path = config_path else: # Use local repository config (default) try: repo = Repo(".") config = repo.get_config() except NotGitRepository: logger.error("error: not a git repository") return 1 # Handle --list if parsed_args.list: for section in config.sections(): for key, value in config.items(section): section_str = ".".join( s.decode("utf-8") if isinstance(s, bytes) else s for s in section ) key_str = key.decode("utf-8") if isinstance(key, bytes) else key value_str = ( value.decode("utf-8") if isinstance(value, bytes) else value ) print(f"{section_str}.{key_str}={value_str}") return 0 # Handle --unset or --unset-all if parsed_args.unset or parsed_args.unset_all: if not parsed_args.key: logger.error("error: key is required for --unset") return 1 # Parse the key (e.g., "user.name" or "remote.origin.url") parts = parsed_args.key.split(".") if len(parts) < 2: logger.error("error: invalid key format") return 1 if len(parts) == 2: section = (parts[0],) name = parts[1] else: # For keys like "remote.origin.url", section is ("remote", "origin") section = tuple(parts[:-1]) name = parts[-1] try: # Check if the key exists first try: config.get(section, name) except KeyError: logger.error(f"error: key '{parsed_args.key}' not found") return 1 # Delete the configuration key using ConfigDict's delete method section_bytes = tuple( s.encode("utf-8") if isinstance(s, str) else s for s in section ) name_bytes = name.encode("utf-8") if isinstance(name, str) else name section_dict = config._values.get(section_bytes) if section_dict: del section_dict[name_bytes] config.write_to_path() else: logger.error(f"error: key '{parsed_args.key}' not found") return 1 except Exception as e: logger.error(f"error: {e}") return 1 return 0 # Handle --get-all if parsed_args.get_all: if not parsed_args.key: logger.error("error: key is required for --get-all") return 1 parts = parsed_args.key.split(".") if len(parts) < 2: logger.error("error: invalid key format") return 1 if len(parts) == 2: section = (parts[0],) name = parts[1] else: section = tuple(parts[:-1]) name = parts[-1] try: for value in config.get_multivar(section, name): value_str = ( value.decode("utf-8") if isinstance(value, bytes) else value ) print(value_str) return 0 except KeyError: return 1 # Handle get (no value provided) if parsed_args.key and not parsed_args.value: parts = parsed_args.key.split(".") if len(parts) < 2: logger.error("error: 
invalid key format") return 1 if len(parts) == 2: section = (parts[0],) name = parts[1] else: # For keys like "remote.origin.url", section is ("remote", "origin") section = tuple(parts[:-1]) name = parts[-1] try: value = config.get(section, name) value_str = value.decode("utf-8") if isinstance(value, bytes) else value print(value_str) return 0 except KeyError: return 1 # Handle set (key and value provided) if parsed_args.key and parsed_args.value: parts = parsed_args.key.split(".") if len(parts) < 2: logger.error("error: invalid key format") return 1 if len(parts) == 2: section = (parts[0],) name = parts[1] else: # For keys like "remote.origin.url", section is ("remote", "origin") section = tuple(parts[:-1]) name = parts[-1] config.set(section, name, parsed_args.value) if parsed_args.global_config: config.write_to_path() else: config.write_to_path() return 0 # No action specified parser.print_help() return 1 class cmd_commit(Command): """Record changes to the repository.""" def run(self, args: Sequence[str]) -> int | None: """Execute the commit command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("--message", "-m", help="Commit message") parser.add_argument( "-a", "--all", action="store_true", help="Automatically stage all tracked files that have been modified", ) parser.add_argument( "--amend", action="store_true", help="Replace the tip of the current branch by creating a new commit", ) parsed_args = parser.parse_args(args) message: bytes | str | Callable[[Repo | None, Commit | None], bytes] if parsed_args.message: message = parsed_args.message elif parsed_args.amend: # For amend, create a callable that opens editor with original message pre-populated def get_amend_message(repo: Repo | None, commit: Commit | None) -> bytes: # Get the original commit message from current HEAD assert repo is not None try: head_commit = repo[repo.head()] assert isinstance(head_commit, Commit) original_message = head_commit.message except KeyError: original_message = b"" # Open editor with original message return _get_commit_message_with_template(original_message, repo, commit) message = get_amend_message else: # For regular commits, use empty template def get_regular_message(repo: Repo | None, commit: Commit | None) -> bytes: return _get_commit_message_with_template(b"", repo, commit) message = get_regular_message try: porcelain.commit( ".", message=message, all=parsed_args.all, amend=parsed_args.amend ) except CommitMessageError as e: logging.exception(e) return 1 return None class cmd_commit_tree(Command): """Create a new commit object from a tree.""" def run(self, args: Sequence[str]) -> None: """Execute the commit-tree command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("--message", "-m", required=True, help="Commit message") parser.add_argument("tree", help="Tree SHA to commit") parsed_args = parser.parse_args(args) porcelain.commit_tree(".", tree=parsed_args.tree, message=parsed_args.message) class cmd_update_server_info(Command): """Update auxiliary info file to help dumb servers.""" def run(self, args: Sequence[str]) -> None: """Execute the update-server-info command. Args: args: Command line arguments """ porcelain.update_server_info(".") class cmd_symbolic_ref(Command): """Read, modify and delete symbolic refs.""" def run(self, args: Sequence[str]) -> int | None: """Execute the symbolic-ref command. 
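# Illustrative sketch of the refs calls used by the symbolic-ref command below;
# the branch name is an example and must already make sense for the repository.
from dulwich.repo import Repo

with Repo(".") as repo:
    repo.refs.set_symbolic_ref(b"HEAD", b"refs/heads/develop")
    target = repo.refs.read_ref(b"HEAD")
    print(target)  # typically b'ref: refs/heads/develop'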
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("name", help="Symbolic reference name") parser.add_argument("ref", nargs="?", help="Target reference") parser.add_argument("--force", action="store_true", help="Force update") parsed_args = parser.parse_args(args) # If ref is provided, we're setting; otherwise we're reading if parsed_args.ref: # Set symbolic reference from .repo import Repo with Repo(".") as repo: repo.refs.set_symbolic_ref( parsed_args.name.encode(), parsed_args.ref.encode() ) return 0 else: # Read symbolic reference from .repo import Repo with Repo(".") as repo: try: target = repo.refs.read_ref(parsed_args.name.encode()) if target is None: logger.error( "fatal: ref '%s' is not a symbolic ref", parsed_args.name ) return 1 elif target.startswith(b"ref: "): logger.info(target[5:].decode()) else: logger.info(target.decode()) return 0 except KeyError: logger.error( "fatal: ref '%s' is not a symbolic ref", parsed_args.name ) return 1 class cmd_pack_refs(Command): """Pack heads and tags for efficient repository access.""" def run(self, argv: Sequence[str]) -> None: """Execute the pack-refs command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("--all", action="store_true") # ignored, we never prune parser.add_argument("--no-prune", action="store_true") args = parser.parse_args(argv) porcelain.pack_refs(".", all=args.all) class cmd_var(Command): """Display Git logical variables.""" def run(self, argv: Sequence[str]) -> int | None: """Execute the var command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "variable", nargs="?", help="Variable to query (e.g., GIT_AUTHOR_IDENT)", ) parser.add_argument( "-l", "--list", action="store_true", help="List all variables", ) args = parser.parse_args(argv) if args.list: # List all variables variables = porcelain.var_list(".") for key, value in sorted(variables.items()): print(f"{key}={value}") return 0 elif args.variable: # Query specific variable try: value = porcelain.var(".", variable=args.variable) print(value) return 0 except KeyError: logger.error("error: variable '%s' has no value", args.variable) return 1 else: # No arguments - print error logger.error("error: variable or -l is required") parser.print_help() return 1 class cmd_show(Command): """Show various types of objects.""" def run(self, argv: Sequence[str]) -> None: """Execute the show command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("objectish", type=str, nargs="*") parser.add_argument( "--color", choices=["always", "never", "auto"], default="auto", help="Use colored output (requires rich)", ) args = parser.parse_args(argv) # Determine if we should use color def _should_use_color() -> bool: if args.color == "always": return True elif args.color == "never": return False else: # auto return sys.stdout.isatty() def _create_output_stream(outstream: TextIO) -> TextIO: """Create output stream, optionally with colorization.""" if not _should_use_color(): return outstream from .diff import ColorizedDiffStream if not ColorizedDiffStream.is_available(): if args.color == "always": raise ImportError( "Rich is required for colored output. Install with: pip install 'dulwich[colordiff]'" ) else: logging.warning( "Rich not available, disabling colored output. 
Install with: pip install 'dulwich[colordiff]'" ) return outstream # Wrap the ColorizedDiffStream (BinaryIO) back to TextIO import io colorized = ColorizedDiffStream(outstream.buffer) return io.TextIOWrapper(colorized, encoding="utf-8", line_buffering=True) with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="show") as outstream: output_stream = _create_output_stream(outstream) porcelain.show(repo, args.objectish or None, outstream=output_stream) class cmd_show_ref(Command): """List references in a local repository.""" def run(self, args: Sequence[str]) -> int | None: """Execute the show-ref command. Args: args: Command line arguments Returns: Exit code (0 for success, 1 for error/no matches, 2 for missing ref with --exists) """ parser = argparse.ArgumentParser() parser.add_argument( "--head", action="store_true", help="Show the HEAD reference", ) parser.add_argument( "--branches", action="store_true", help="Limit to local branches", ) parser.add_argument( "--tags", action="store_true", help="Limit to local tags", ) parser.add_argument( "-d", "--dereference", action="store_true", help="Dereference tags into object IDs", ) parser.add_argument( "-s", "--hash", nargs="?", const=40, # TODO: Support SHA256 type=int, metavar="n", help="Only show the OID, not the reference name", ) parser.add_argument( "--abbrev", nargs="?", const=7, type=int, metavar="n", help="Abbreviate the object name", ) parser.add_argument( "--verify", action="store_true", help="Enable stricter reference checking (exact path match)", ) parser.add_argument( "--exists", action="store_true", help="Check whether the given reference exists", ) parser.add_argument( "-q", "--quiet", action="store_true", help="Do not print any results to stdout", ) parser.add_argument( "patterns", nargs="*", help="Show references matching patterns", ) parsed_args = parser.parse_args(args) # Handle --exists mode if parsed_args.exists: if not parsed_args.patterns or len(parsed_args.patterns) != 1: logger.error("--exists requires exactly one reference argument") return 1 try: with Repo(".") as repo: repo_refs = repo.get_refs() pattern_bytes = os.fsencode(parsed_args.patterns[0]) if pattern_bytes in repo_refs: return 0 # Reference exists else: return 2 # Reference missing except (NotGitRepository, OSError, FileFormatException) as e: logger.error(f"Error looking up reference: {e}") return 1 # Error looking up reference # Regular show-ref mode try: matched_refs = porcelain.show_ref( ".", patterns=parsed_args.patterns if parsed_args.patterns else None, head=parsed_args.head, branches=parsed_args.branches, tags=parsed_args.tags, dereference=parsed_args.dereference, verify=parsed_args.verify, ) except (NotGitRepository, OSError, FileFormatException) as e: logger.error(f"Error: {e}") return 1 # Return error if no matches found (unless quiet) if not matched_refs: if parsed_args.verify and not parsed_args.quiet: logger.error("error: no matching refs found") return 1 # Output results if not parsed_args.quiet: # TODO: Add support for SHA256 abbrev_len = parsed_args.abbrev if parsed_args.abbrev else 40 hash_only = parsed_args.hash is not None if hash_only and parsed_args.hash: abbrev_len = parsed_args.hash for sha, ref in matched_refs: sha_str = sha.decode() if abbrev_len < 40: sha_str = sha_str[:abbrev_len] if hash_only: logger.info(sha_str) else: logger.info(f"{sha_str} {ref.decode()}") return 0 class cmd_show_branch(Command): """Show branches and their commits.""" def run(self, args: Sequence[str]) -> int | None: 
"""Execute the show-branch command. Args: args: Command line arguments Returns: Exit code (0 for success, 1 for error) """ parser = argparse.ArgumentParser() parser.add_argument( "-r", "--remotes", action="store_true", help="Show remote-tracking branches", ) parser.add_argument( "-a", "--all", dest="all_branches", action="store_true", help="Show both remote-tracking and local branches", ) parser.add_argument( "--current", action="store_true", help="Include current branch if not given on command line", ) parser.add_argument( "--topo-order", dest="topo_order", action="store_true", help="Show commits in topological order", ) parser.add_argument( "--date-order", action="store_true", help="Show commits in date order (default)", ) parser.add_argument( "--more", type=int, metavar="n", help="Show n more commits beyond common ancestor", ) parser.add_argument( "--list", dest="list_branches", action="store_true", help="Show only branch names and their tip commits", ) parser.add_argument( "--independent", dest="independent_branches", action="store_true", help="Show only branches not reachable from any other", ) parser.add_argument( "--merge-base", dest="merge_base", action="store_true", help="Show merge base of specified branches", ) parser.add_argument( "branches", nargs="*", help="Branches to show (default: all local branches)", ) parsed_args = parser.parse_args(args) try: output_lines = porcelain.show_branch( ".", branches=parsed_args.branches if parsed_args.branches else None, all_branches=parsed_args.all_branches, remotes=parsed_args.remotes, current=parsed_args.current, topo_order=parsed_args.topo_order, more=parsed_args.more, list_branches=parsed_args.list_branches, independent_branches=parsed_args.independent_branches, merge_base=parsed_args.merge_base, ) except (NotGitRepository, OSError, FileFormatException) as e: logger.error(f"Error: {e}") return 1 # Output results for line in output_lines: logger.info(line) return 0 class cmd_diff_tree(Command): """Compare the content and mode of trees.""" def run(self, args: Sequence[str]) -> None: """Execute the diff-tree command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("old_tree", help="Old tree SHA") parser.add_argument("new_tree", help="New tree SHA") parsed_args = parser.parse_args(args) porcelain.diff_tree(".", parsed_args.old_tree, parsed_args.new_tree) class cmd_rev_list(Command): """List commit objects in reverse chronological order.""" def run(self, args: Sequence[str]) -> None: """Execute the rev-list command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("commits", nargs="+", help="Commit IDs to list") parsed_args = parser.parse_args(args) porcelain.rev_list(".", parsed_args.commits) class cmd_tag(Command): """Create, list, delete or verify a tag object.""" def run(self, args: Sequence[str]) -> None: """Execute the tag command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "-a", "--annotated", help="Create an annotated tag.", action="store_true", ) parser.add_argument( "-s", "--sign", help="Sign the annotated tag.", action="store_true" ) parser.add_argument("tag_name", help="Name of the tag to create") parsed_args = parser.parse_args(args) porcelain.tag_create( ".", parsed_args.tag_name, annotated=parsed_args.annotated, sign=parsed_args.sign, ) class cmd_verify_commit(Command): """Check the GPG signature of commits.""" def run(self, args: Sequence[str]) -> int | None: """Execute the verify-commit command. Args: args: Command line arguments Returns: Exit code (1 on error, None on success) """ parser = argparse.ArgumentParser() parser.add_argument( "-v", "--verbose", help="Print the contents of the commit object before validating it.", action="store_true", ) parser.add_argument( "--raw", help="Print the raw gpg status output to standard error.", action="store_true", ) parser.add_argument( "commits", nargs="*", default=["HEAD"], help="Commits to verify (defaults to HEAD)", ) parsed_args = parser.parse_args(args) exit_code = None for commit in parsed_args.commits: try: if parsed_args.verbose: # Show commit contents before verification porcelain.show( ".", objects=[commit], outstream=sys.stdout, ) porcelain.verify_commit(".", commit) if not parsed_args.raw: print(f"gpg: Good signature from commit '{commit}'") except Exception as e: if not parsed_args.raw: print(f"error: {commit}: {e}", file=sys.stderr) else: # In raw mode, let the exception propagate raise exit_code = 1 return exit_code class cmd_verify_tag(Command): """Check the GPG signature of tags.""" def run(self, args: Sequence[str]) -> int | None: """Execute the verify-tag command. Args: args: Command line arguments Returns: Exit code (1 on error, None on success) """ parser = argparse.ArgumentParser() parser.add_argument( "-v", "--verbose", help="Print the contents of the tag object before validating it.", action="store_true", ) parser.add_argument( "--raw", help="Print the raw gpg status output to standard error.", action="store_true", ) parser.add_argument("tags", nargs="+", help="Tags to verify") parsed_args = parser.parse_args(args) exit_code = None for tag in parsed_args.tags: try: if parsed_args.verbose: # Show tag contents before verification porcelain.show( ".", objects=[tag], outstream=sys.stdout, ) porcelain.verify_tag(".", tag) if not parsed_args.raw: print(f"gpg: Good signature from tag '{tag}'") except Exception as e: if not parsed_args.raw: print(f"error: {tag}: {e}", file=sys.stderr) else: # In raw mode, let the exception propagate raise exit_code = 1 return exit_code class cmd_repack(Command): """Pack unpacked objects in a repository.""" def run(self, args: Sequence[str]) -> None: """Execute the repack command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--write-bitmap-index", action="store_true", help="write a bitmap index for packs", ) parsed_args = parser.parse_args(args) porcelain.repack(".", write_bitmaps=parsed_args.write_bitmap_index) class cmd_reflog(Command): """Manage reflog information.""" def run(self, args: Sequence[str]) -> None: """Execute the reflog command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser(prog="dulwich reflog") subparsers = parser.add_subparsers(dest="subcommand", help="Subcommand") # Show subcommand (default when no subcommand is specified) show_parser = subparsers.add_parser( "show", help="Show reflog entries (default)", add_help=False ) show_parser.add_argument( "ref", nargs="?", default="HEAD", help="Reference to show reflog for" ) show_parser.add_argument( "--all", action="store_true", help="Show reflogs for all refs" ) # Expire subcommand expire_parser = subparsers.add_parser("expire", help="Expire reflog entries") expire_parser.add_argument( "ref", nargs="?", help="Reference to expire reflog for" ) expire_parser.add_argument( "--all", action="store_true", help="Expire reflogs for all refs" ) expire_parser.add_argument( "--expire", type=str, help="Expire entries older than time (e.g., '90 days ago', 'all', 'never')", ) expire_parser.add_argument( "--expire-unreachable", type=str, help="Expire unreachable entries older than time", ) expire_parser.add_argument( "--dry-run", "-n", action="store_true", help="Show what would be expired" ) # Delete subcommand delete_parser = subparsers.add_parser( "delete", help="Delete specific reflog entry" ) delete_parser.add_argument( "refspec", help="Reference specification (e.g., HEAD@{1})" ) delete_parser.add_argument( "--rewrite", action="store_true", help="Rewrite subsequent entries to maintain consistency", ) # If no arguments or first arg is not a subcommand, treat as show if not args or (args[0] not in ["show", "expire", "delete"]): # Parse as show command parsed_args = parser.parse_args(["show", *list(args)]) else: parsed_args = parser.parse_args(args) if parsed_args.subcommand == "expire": self._run_expire(parsed_args) elif parsed_args.subcommand == "delete": self._run_delete(parsed_args) else: # show or default self._run_show(parsed_args) def _run_show(self, parsed_args: argparse.Namespace) -> None: """Show reflog entries.""" with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="reflog") as outstream: if parsed_args.all: # Show reflogs for all refs for ref_bytes, entry in porcelain.reflog(repo, all=True): ref_str = ref_bytes.decode("utf-8", "replace") short_new = entry.new_sha[:8].decode("ascii") outstream.write( f"{short_new} {ref_str}: {entry.message.decode('utf-8', 'replace')}\n" ) else: ref = ( parsed_args.ref.encode("utf-8") if isinstance(parsed_args.ref, str) else parsed_args.ref ) for i, entry in enumerate(porcelain.reflog(repo, ref)): # Format similar to git reflog from dulwich.reflog import Entry assert isinstance(entry, Entry) short_new = entry.new_sha[:8].decode("ascii") message = ( entry.message.decode("utf-8", "replace") if entry.message else "" ) outstream.write( f"{short_new} {ref.decode('utf-8', 'replace')}@{{{i}}}: {message}\n" ) def _run_expire(self, parsed_args: argparse.Namespace) -> None: """Expire reflog entries.""" # Parse time specifications expire_time = None expire_unreachable_time = None if parsed_args.expire: expire_time = parse_time_to_timestamp(parsed_args.expire) if parsed_args.expire_unreachable: expire_unreachable_time = parse_time_to_timestamp( parsed_args.expire_unreachable ) # Execute expire result = porcelain.reflog_expire( repo=".", ref=parsed_args.ref, all=parsed_args.all, expire_time=expire_time, expire_unreachable_time=expire_unreachable_time, dry_run=parsed_args.dry_run, ) # Print results for ref_name, count in result.items(): ref_str = ref_name.decode("utf-8", "replace") 
if parsed_args.dry_run: print(f"Would expire {count} entries from {ref_str}") else: print(f"Expired {count} entries from {ref_str}") def _run_delete(self, parsed_args: argparse.Namespace) -> None: """Delete a specific reflog entry.""" from dulwich.reflog import parse_reflog_spec # Parse refspec (e.g., "HEAD@{1}" or "refs/heads/master@{2}") ref, index = parse_reflog_spec(parsed_args.refspec) # Execute delete porcelain.reflog_delete( repo=".", ref=ref, index=index, rewrite=parsed_args.rewrite, ) print(f"Deleted entry {ref.decode('utf-8', 'replace')}@{{{index}}}") class cmd_reset(Command): """Reset current HEAD to the specified state.""" def run(self, args: Sequence[str]) -> None: """Execute the reset command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() mode_group = parser.add_mutually_exclusive_group() mode_group.add_argument( "--hard", action="store_true", help="Reset working tree and index" ) mode_group.add_argument("--soft", action="store_true", help="Reset only HEAD") mode_group.add_argument( "--mixed", action="store_true", help="Reset HEAD and index" ) parser.add_argument("treeish", nargs="?", help="Commit/tree to reset to") parsed_args = parser.parse_args(args) if parsed_args.hard: mode = "hard" elif parsed_args.soft: mode = "soft" elif parsed_args.mixed: mode = "mixed" else: # Default to mixed behavior mode = "mixed" # Use the porcelain.reset function for all modes porcelain.reset(".", mode=mode, treeish=parsed_args.treeish) class cmd_revert(Command): """Revert some existing commits.""" def run(self, args: Sequence[str]) -> None: """Execute the revert command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--no-commit", "-n", action="store_true", help="Apply changes but don't create a commit", ) parser.add_argument("-m", "--message", help="Custom commit message") parser.add_argument("commits", nargs="+", help="Commits to revert") parsed_args = parser.parse_args(args) result = porcelain.revert( ".", commits=parsed_args.commits, no_commit=parsed_args.no_commit, message=parsed_args.message, ) if result and not parsed_args.no_commit: logger.info("[%s] Revert completed", result.decode("ascii")[:7]) class cmd_daemon(Command): """Run a simple Git protocol server.""" def run(self, args: Sequence[str]) -> None: """Execute the daemon command. Args: args: Command line arguments """ from dulwich import log_utils from .protocol import TCP_GIT_PORT parser = argparse.ArgumentParser() parser.add_argument( "-l", "--listen_address", default="localhost", help="Binding IP address.", ) parser.add_argument( "-p", "--port", type=int, default=TCP_GIT_PORT, help="Binding TCP port.", ) parser.add_argument( "gitdir", nargs="?", default=".", help="Git directory to serve" ) parsed_args = parser.parse_args(args) log_utils.default_logging_config() porcelain.daemon( parsed_args.gitdir, address=parsed_args.listen_address, port=parsed_args.port, ) class cmd_web_daemon(Command): """Run a simple HTTP server for Git repositories.""" def run(self, args: Sequence[str]) -> None: """Execute the web-daemon command. 
Args: args: Command line arguments """ from dulwich import log_utils parser = argparse.ArgumentParser() parser.add_argument( "-l", "--listen_address", default="", help="Binding IP address.", ) parser.add_argument( "-p", "--port", type=int, default=8000, help="Binding TCP port.", ) parser.add_argument( "gitdir", nargs="?", default=".", help="Git directory to serve" ) parsed_args = parser.parse_args(args) log_utils.default_logging_config() porcelain.web_daemon( parsed_args.gitdir, address=parsed_args.listen_address, port=parsed_args.port, ) class cmd_write_tree(Command): """Create a tree object from the current index.""" def run(self, args: Sequence[str]) -> None: """Execute the write-tree command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) sys.stdout.write("{}\n".format(porcelain.write_tree(".").decode())) class cmd_receive_pack(Command): """Receive what is pushed into the repository.""" def run(self, args: Sequence[str]) -> None: """Execute the receive-pack command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("gitdir", nargs="?", default=".", help="Git directory") parsed_args = parser.parse_args(args) porcelain.receive_pack(parsed_args.gitdir) class cmd_upload_pack(Command): """Send objects packed back to git-fetch-pack.""" def run(self, args: Sequence[str]) -> None: """Execute the upload-pack command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("gitdir", nargs="?", default=".", help="Git directory") parsed_args = parser.parse_args(args) porcelain.upload_pack(parsed_args.gitdir) class cmd_shortlog(Command): """Show a shortlog of commits by author.""" def run(self, args: Sequence[str]) -> None: """Execute the shortlog command with the given CLI arguments. Args: args: List of command line arguments. """ parser = argparse.ArgumentParser() parser.add_argument("gitdir", nargs="?", default=".", help="Git directory") parser.add_argument("--summary", action="store_true", help="Show summary only") parser.add_argument( "--sort", action="store_true", help="Sort authors by commit count" ) parsed_args = parser.parse_args(args) shortlog_items: list[dict[str, str]] = porcelain.shortlog( repo=parsed_args.gitdir, summary_only=parsed_args.summary, sort_by_commits=parsed_args.sort, ) for item in shortlog_items: author: str = item["author"] messages: str = item["messages"] if parsed_args.summary: count = len(messages.splitlines()) sys.stdout.write(f"{count}\t{author}\n") else: sys.stdout.write(f"{author} ({len(messages.splitlines())}):\n") for msg in messages.splitlines(): sys.stdout.write(f" {msg}\n") sys.stdout.write("\n") class cmd_status(Command): """Show the working tree status.""" def run(self, args: Sequence[str]) -> None: """Execute the status command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("gitdir", nargs="?", default=".", help="Git directory") parser.add_argument( "--column", action="store_true", help="Display untracked files in columns", ) parsed_args = parser.parse_args(args) status = porcelain.status(parsed_args.gitdir) if any(names for (kind, names) in status.staged.items()): sys.stdout.write("Changes to be committed:\n\n") for kind, names in status.staged.items(): for name in names: sys.stdout.write(f"\t{kind}: {os.fsdecode(name)}\n") sys.stdout.write("\n") if status.unstaged: sys.stdout.write("Changes not staged for commit:\n\n") for name in status.unstaged: sys.stdout.write(f"\t{os.fsdecode(name)}\n") sys.stdout.write("\n") if status.untracked: sys.stdout.write("Untracked files:\n\n") if parsed_args.column: # Format untracked files in columns untracked_names = [os.fsdecode(name) for name in status.untracked] output = format_columns(untracked_names, mode="column", indent="\t") sys.stdout.write(output) else: for name in status.untracked: sys.stdout.write(f"\t{os.fsdecode(name)}\n") sys.stdout.write("\n") class cmd_ls_remote(Command): """List references in a remote repository.""" def run(self, args: Sequence[str]) -> None: """Execute the ls-remote command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--symref", action="store_true", help="Show symbolic references" ) parser.add_argument("url", help="Remote URL to list references from") parsed_args = parser.parse_args(args) result = porcelain.ls_remote(parsed_args.url) if parsed_args.symref: # Show symrefs first, like git does for ref, target in sorted(result.symrefs.items()): if target: sys.stdout.write(f"ref: {target.decode()}\t{ref.decode()}\n") # Show regular refs for ref in sorted(result.refs): sha = result.refs[ref] if sha is not None: sys.stdout.write(f"{sha.decode()}\t{ref.decode()}\n") class cmd_ls_tree(Command): """List the contents of a tree object.""" def run(self, args: Sequence[str]) -> None: """Execute the ls-tree command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "-r", "--recursive", action="store_true", help="Recursively list tree contents.", ) parser.add_argument( "--name-only", action="store_true", help="Only display name." ) parser.add_argument("treeish", nargs="?", help="Tree-ish to list") parsed_args = parser.parse_args(args) with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="ls-tree") as outstream: porcelain.ls_tree( repo, parsed_args.treeish, outstream=outstream, recursive=parsed_args.recursive, name_only=parsed_args.name_only, ) class cmd_pack_objects(Command): """Create a packed archive of objects.""" def run(self, args: Sequence[str]) -> None: """Execute the pack-objects command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--stdout", action="store_true", help="Write pack to stdout" ) parser.add_argument("--deltify", action="store_true", help="Create deltas") parser.add_argument( "--no-reuse-deltas", action="store_true", help="Don't reuse existing deltas" ) parser.add_argument("basename", nargs="?", help="Base name for pack files") parsed_args = parser.parse_args(args) if not parsed_args.stdout and not parsed_args.basename: parser.error("basename required when not using --stdout") object_ids = [ObjectID(line.strip().encode()) for line in sys.stdin.readlines()] deltify = parsed_args.deltify reuse_deltas = not parsed_args.no_reuse_deltas if parsed_args.stdout: packf = getattr(sys.stdout, "buffer", sys.stdout) assert isinstance(packf, BinaryIO) idxf = None close = [] else: packf = open(parsed_args.basename + ".pack", "wb") idxf = open(parsed_args.basename + ".idx", "wb") close = [packf, idxf] porcelain.pack_objects( ".", object_ids, packf, idxf, deltify=deltify, reuse_deltas=reuse_deltas ) for f in close: f.close() class cmd_unpack_objects(Command): """Unpack objects from a packed archive.""" def run(self, args: Sequence[str]) -> None: """Execute the unpack-objects command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("pack_file", help="Pack file to unpack") parsed_args = parser.parse_args(args) count = porcelain.unpack_objects(parsed_args.pack_file) logger.info("Unpacked %d objects", count) class cmd_prune(Command): """Prune all unreachable objects from the object database.""" def run(self, args: Sequence[str]) -> int | None: """Execute the prune command. Args: args: Command line arguments """ import datetime import time from dulwich.object_store import DEFAULT_TEMPFILE_GRACE_PERIOD parser = argparse.ArgumentParser( description="Remove temporary pack files left behind by interrupted operations" ) parser.add_argument( "--expire", nargs="?", const="2.weeks.ago", help="Only prune files older than the specified date (default: 2.weeks.ago)", ) parser.add_argument( "--dry-run", "-n", action="store_true", help="Only report what would be removed", ) parser.add_argument( "--verbose", "-v", action="store_true", help="Report all actions", ) parsed_args = parser.parse_args(args) # Parse expire grace period grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD if parsed_args.expire: from .approxidate import parse_relative_time try: grace_period = parse_relative_time(parsed_args.expire) except ValueError: # Try to parse as absolute date try: date = datetime.datetime.strptime(parsed_args.expire, "%Y-%m-%d") grace_period = int(time.time() - date.timestamp()) except ValueError: logger.error("Invalid expire date: %s", parsed_args.expire) return 1 # Progress callback def progress(msg: str) -> None: if parsed_args.verbose: logger.info("%s", msg) try: porcelain.prune( ".", grace_period=grace_period, dry_run=parsed_args.dry_run, progress=progress if parsed_args.verbose else None, ) return None except porcelain.Error as e: logger.error("%s", e) return 1 class cmd_pull(Command): """Fetch from and integrate with another repository or a local branch.""" def run(self, args: Sequence[str]) -> None: """Execute the pull command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("from_location", type=str) parser.add_argument("refspec", type=str, nargs="*") parser.add_argument("--filter", type=str, nargs=1) parser.add_argument("--protocol", type=int) parsed_args = parser.parse_args(args) porcelain.pull( ".", remote_location=parsed_args.from_location or None, refspecs=parsed_args.refspec or None, filter_spec=parsed_args.filter, protocol_version=parsed_args.protocol or None, ) class cmd_push(Command): """Update remote refs along with associated objects.""" def run(self, argv: Sequence[str]) -> int | None: """Execute the push command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("-f", "--force", action="store_true", help="Force") parser.add_argument("to_location", type=str) parser.add_argument("refspec", type=str, nargs="*") args = parser.parse_args(argv) try: porcelain.push( ".", args.to_location, args.refspec or None, force=args.force ) except porcelain.DivergedBranches: sys.stderr.write("Diverged branches; specify --force to override") return 1 return None class cmd_remote_add(Command): """Add a remote repository.""" def run(self, args: Sequence[str]) -> None: """Execute the remote-add command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("name", help="Name of the remote") parser.add_argument("url", help="URL of the remote") parsed_args = parser.parse_args(args) porcelain.remote_add(".", parsed_args.name, parsed_args.url) class SuperCommand(Command): """Base class for commands that have subcommands.""" subcommands: ClassVar[dict[str, type[Command]]] = {} default_command: ClassVar[type[Command] | None] = None def run(self, args: Sequence[str]) -> int | None: """Execute the subcommand command. Args: args: Command line arguments """ if not args: if self.default_command: return self.default_command().run(args) else: logger.info( "Supported subcommands: %s", ", ".join(self.subcommands.keys()) ) return False cmd = args[0] try: cmd_kls = self.subcommands[cmd] except KeyError: logger.error("No such subcommand: %s", args[0]) sys.exit(1) return cmd_kls().run(args[1:]) class cmd_remote(SuperCommand): """Manage set of tracked repositories.""" subcommands: ClassVar[dict[str, type[Command]]] = { "add": cmd_remote_add, } class cmd_submodule_list(Command): """List submodules.""" def run(self, argv: Sequence[str]) -> None: """Execute the submodule-list command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(argv) for path, sha in porcelain.submodule_list("."): sys.stdout.write(f" {sha} {path}\n") class cmd_submodule_init(Command): """Initialize submodules.""" def run(self, argv: Sequence[str]) -> None: """Execute the submodule-init command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(argv) porcelain.submodule_init(".") class cmd_submodule_add(Command): """Add a submodule.""" def run(self, argv: Sequence[str]) -> None: """Execute the submodule-add command. 
Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("url", help="URL of repository to add as submodule") parser.add_argument("path", nargs="?", help="Path where submodule should live") parser.add_argument("--name", help="Name for the submodule") args = parser.parse_args(argv) porcelain.submodule_add(".", args.url, args.path, args.name) class cmd_submodule_update(Command): """Update submodules.""" def run(self, argv: Sequence[str]) -> None: """Execute the submodule-update command. Args: argv: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--init", action="store_true", help="Initialize submodules first" ) parser.add_argument( "--force", action="store_true", help="Force update even if local changes exist", ) parser.add_argument( "--recursive", action="store_true", help="Recursively update nested submodules", ) parser.add_argument( "paths", nargs="*", help="Specific submodule paths to update" ) args = parser.parse_args(argv) paths = args.paths if args.paths else None porcelain.submodule_update( ".", paths=paths, init=args.init, force=args.force, recursive=args.recursive ) class cmd_submodule(SuperCommand): """Initialize, update or inspect submodules.""" subcommands: ClassVar[dict[str, type[Command]]] = { "add": cmd_submodule_add, "init": cmd_submodule_init, "list": cmd_submodule_list, "update": cmd_submodule_update, } default_command = cmd_submodule_list class cmd_check_ignore(Command): """Check whether files are excluded by gitignore.""" def run(self, args: Sequence[str]) -> int: """Execute the check-ignore command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("paths", nargs="+", help="Paths to check") parsed_args = parser.parse_args(args) ret = 1 for path in porcelain.check_ignore(".", parsed_args.paths): logger.info(path) ret = 0 return ret class cmd_check_mailmap(Command): """Show canonical names and email addresses of contacts.""" def run(self, args: Sequence[str]) -> None: """Execute the check-mailmap command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("identities", nargs="+", help="Identities to check") parsed_args = parser.parse_args(args) for identity in parsed_args.identities: canonical_identity = porcelain.check_mailmap(".", identity) logger.info(canonical_identity) class cmd_branch(Command): """List, create, or delete branches.""" def run(self, args: Sequence[str]) -> int | None: """Execute the branch command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "branch", type=str, nargs="?", help="Name of the branch", ) parser.add_argument( "-d", "--delete", action="store_true", help="Delete branch", ) parser.add_argument("--all", action="store_true", help="List all branches") parser.add_argument( "--merged", action="store_true", help="List merged into current branch" ) parser.add_argument( "--no-merged", action="store_true", help="List branches not merged into current branch", ) parser.add_argument( "--remotes", action="store_true", help="List remotes branches" ) parser.add_argument( "--contains", nargs="?", const="HEAD", help="List branches that contain a specific commit", ) parser.add_argument( "--column", action="store_true", help="Display branch list in columns" ) parser.add_argument( "--list", nargs="?", const=None, help="List branches matching a pattern", ) parsed_args = parser.parse_args(args) def print_branches( branches: Iterator[bytes] | Sequence[bytes], use_columns: bool = False ) -> None: if use_columns: branch_names = [branch.decode() for branch in branches] output = format_columns(branch_names, mode="column") sys.stdout.write(output) else: for branch in branches: sys.stdout.write(f"{branch.decode()}\n") branches: Iterator[bytes] | list[bytes] | None = None try: if parsed_args.all: branches = porcelain.branch_list(".") + porcelain.branch_remotes_list( "." ) elif parsed_args.remotes: branches = porcelain.branch_remotes_list(".") elif parsed_args.merged: branches = porcelain.merged_branches(".") elif parsed_args.no_merged: branches = porcelain.no_merged_branches(".") elif parsed_args.contains: try: branches = list( porcelain.branches_containing(".", commit=parsed_args.contains) ) except KeyError as e: sys.stderr.write( f"error: object name {e.args[0].decode()} not found\n" ) return 1 except porcelain.Error as e: sys.stderr.write(f"{e}") return 1 pattern = parsed_args.list if pattern is not None and branches: branches = porcelain.filter_branches_by_pattern(branches, pattern) if branches is not None: print_branches(branches, parsed_args.column) return 0 if not parsed_args.branch: logger.error("Usage: dulwich branch [-d] BRANCH_NAME") return 1 if parsed_args.delete: porcelain.branch_delete(".", name=parsed_args.branch) else: try: porcelain.branch_create(".", name=parsed_args.branch) except porcelain.Error as e: sys.stderr.write(f"{e}") return 1 return 0 class cmd_checkout(Command): """Switch branches or restore working tree files.""" def run(self, args: Sequence[str]) -> int | None: """Execute the checkout command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "target", type=str, help="Name of the branch, tag, or commit to checkout", ) parser.add_argument( "-f", "--force", action="store_true", help="Force checkout", ) parser.add_argument( "-b", "--new-branch", type=str, help="Create a new branch at the target and switch to it", ) parsed_args = parser.parse_args(args) if not parsed_args.target: logger.error("Usage: dulwich checkout TARGET [--force] [-b NEW_BRANCH]") return 1 try: porcelain.checkout( ".", target=parsed_args.target, force=parsed_args.force, new_branch=parsed_args.new_branch, ) except porcelain.CheckoutError as e: sys.stderr.write(f"{e}\n") return 1 return 0 class cmd_restore(Command): """Restore working tree files.""" def run(self, args: Sequence[str]) -> int | None: """Execute the restore command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "paths", nargs="+", type=str, help="Paths to restore", ) parser.add_argument( "-s", "--source", type=str, help="Restore from a specific commit (default: HEAD for --staged, index for worktree)", ) parser.add_argument( "--staged", action="store_true", help="Restore files in the index", ) parser.add_argument( "--worktree", action="store_true", help="Restore files in the working tree", ) parsed_args = parser.parse_args(args) # If neither --staged nor --worktree is specified, default to --worktree if not parsed_args.staged and not parsed_args.worktree: worktree = True staged = False else: worktree = parsed_args.worktree staged = parsed_args.staged try: porcelain.restore( ".", paths=parsed_args.paths, source=parsed_args.source, staged=staged, worktree=worktree, ) except porcelain.CheckoutError as e: sys.stderr.write(f"{e}\n") return 1 return 0 class cmd_switch(Command): """Switch branches.""" def run(self, args: Sequence[str]) -> int | None: """Execute the switch command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "target", type=str, help="Branch or commit to switch to", ) parser.add_argument( "-c", "--create", type=str, help="Create a new branch at the target and switch to it", ) parser.add_argument( "-f", "--force", action="store_true", help="Force switch even if there are local changes", ) parser.add_argument( "-d", "--detach", action="store_true", help="Switch to a commit in detached HEAD state", ) parsed_args = parser.parse_args(args) if not parsed_args.target: logger.error( "Usage: dulwich switch TARGET [-c NEW_BRANCH] [--force] [--detach]" ) return 1 try: porcelain.switch( ".", target=parsed_args.target, create=parsed_args.create, force=parsed_args.force, detach=parsed_args.detach, ) except porcelain.CheckoutError as e: sys.stderr.write(f"{e}\n") return 1 return 0 class cmd_stash_list(Command): """List stash entries.""" def run(self, args: Sequence[str]) -> None: """Execute the stash-list command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) from .repo import Repo from .stash import Stash with Repo(".") as r: stash = Stash.from_repo(r) for i, entry in enumerate(stash.stashes()): logger.info( "stash@{%d}: %s", i, entry.message.decode("utf-8", "replace").rstrip("\n"), ) class cmd_stash_push(Command): """Save your local modifications to a new stash.""" def run(self, args: Sequence[str]) -> None: """Execute the stash-push command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) porcelain.stash_push(".") logger.info("Saved working directory and index state") class cmd_stash_pop(Command): """Apply a stash and remove it from the stash list.""" def run(self, args: Sequence[str]) -> None: """Execute the stash-pop command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) porcelain.stash_pop(".") logger.info("Restored working directory and index state") class cmd_bisect(SuperCommand): """Use binary search to find the commit that introduced a bug.""" subcommands: ClassVar[dict[str, type[Command]]] = {} def run(self, args: Sequence[str]) -> int | None: """Execute the bisect command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser(prog="dulwich bisect") subparsers = parser.add_subparsers(dest="subcommand", help="bisect subcommands") # bisect start start_parser = subparsers.add_parser("start", help="Start a new bisect session") start_parser.add_argument("bad", nargs="?", help="Bad commit") start_parser.add_argument("good", nargs="*", help="Good commit(s)") start_parser.add_argument( "--no-checkout", action="store_true", help="Don't checkout commits during bisect", ) start_parser.add_argument( "--term-bad", default="bad", help="Term to use for bad commits" ) start_parser.add_argument( "--term-good", default="good", help="Term to use for good commits" ) start_parser.add_argument( "--", dest="paths", nargs="*", help="Paths to limit bisect to" ) # bisect bad bad_parser = subparsers.add_parser("bad", help="Mark a commit as bad") bad_parser.add_argument("rev", nargs="?", help="Commit to mark as bad") # bisect good good_parser = subparsers.add_parser("good", help="Mark a commit as good") good_parser.add_argument("rev", nargs="?", help="Commit to mark as good") # bisect skip skip_parser = subparsers.add_parser("skip", help="Skip commits") skip_parser.add_argument("revs", nargs="*", help="Commits to skip") # bisect reset reset_parser = subparsers.add_parser("reset", help="Reset bisect state") reset_parser.add_argument("commit", nargs="?", help="Commit to reset to") # bisect log subparsers.add_parser("log", help="Show bisect log") # bisect replay replay_parser = subparsers.add_parser("replay", help="Replay bisect log") replay_parser.add_argument("logfile", help="Log file to replay") # bisect help subparsers.add_parser("help", help="Show help") parsed_args = parser.parse_args(args) if not parsed_args.subcommand: parser.print_help() return 1 try: if parsed_args.subcommand == "start": next_sha = porcelain.bisect_start( bad=parsed_args.bad, good=parsed_args.good if parsed_args.good else None, paths=parsed_args.paths, no_checkout=parsed_args.no_checkout, term_bad=parsed_args.term_bad, term_good=parsed_args.term_good, ) if next_sha: logger.info( "Bisecting: checking out '%s'", next_sha.decode("ascii") ) elif parsed_args.subcommand == "bad": next_sha = porcelain.bisect_bad(rev=parsed_args.rev) if next_sha: logger.info( "Bisecting: checking out '%s'", next_sha.decode("ascii") ) else: # Bisect complete - find the first bad commit with porcelain.open_repo_closing(".") as r: bad_ref = os.path.join(r.controldir(), "refs", "bisect", "bad") with open(bad_ref, "rb") as f: bad_sha = ObjectID(f.read().strip()) commit = r.object_store[bad_sha] assert isinstance(commit, Commit) message = commit.message.decode( "utf-8", errors="replace" ).split("\n")[0] logger.info( "%s is the first bad commit", bad_sha.decode("ascii") ) logger.info("commit %s", bad_sha.decode("ascii")) logger.info(" %s", message) elif parsed_args.subcommand == "good": next_sha = porcelain.bisect_good(rev=parsed_args.rev) if next_sha: logger.info( "Bisecting: checking out '%s'", next_sha.decode("ascii") ) elif parsed_args.subcommand == "skip": next_sha = porcelain.bisect_skip( revs=parsed_args.revs if parsed_args.revs else None ) if next_sha: logger.info( "Bisecting: checking out '%s'", next_sha.decode("ascii") ) elif parsed_args.subcommand == "reset": porcelain.bisect_reset(commit=parsed_args.commit) logger.info("Bisect reset") elif parsed_args.subcommand == "log": log = porcelain.bisect_log() logger.info(log.rstrip()) elif parsed_args.subcommand == "replay": porcelain.bisect_replay(".", 
log_file=parsed_args.logfile) logger.info("Replayed bisect log from %s", parsed_args.logfile) elif parsed_args.subcommand == "help": parser.print_help() except porcelain.Error as e: logger.error("%s", e) return 1 except ValueError as e: logger.error("%s", e) return 1 return 0 class cmd_stash(SuperCommand): """Stash the changes in a dirty working directory away.""" subcommands: ClassVar[dict[str, type[Command]]] = { "list": cmd_stash_list, "pop": cmd_stash_pop, "push": cmd_stash_push, } class cmd_ls_files(Command): """Show information about files in the index and working tree.""" def run(self, args: Sequence[str]) -> None: """Execute the ls-files command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) for name in porcelain.ls_files("."): logger.info(name) class cmd_describe(Command): """Give an object a human readable name based on an available ref.""" def run(self, args: Sequence[str]) -> None: """Execute the describe command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) logger.info(porcelain.describe(".")) class cmd_diagnose(Command): """Display diagnostic information about the Python environment.""" def run(self, args: Sequence[str]) -> None: """Execute the diagnose command. Args: args: Command line arguments """ # TODO: Support creating zip files with diagnostic information parser = argparse.ArgumentParser() parser.parse_args(args) # Python version and executable logger.info("Python version: %s", sys.version) logger.info("Python executable: %s", sys.executable) # PYTHONPATH pythonpath = os.environ.get("PYTHONPATH", "") if pythonpath: logger.info("PYTHONPATH: %s", pythonpath) else: logger.info("PYTHONPATH: (not set)") # sys.path logger.info("sys.path:") for path_entry in sys.path: logger.info(" %s", path_entry) # Dulwich version try: import dulwich logger.info("Dulwich version: %s", dulwich.__version__) except AttributeError: logger.info("Dulwich version: (unknown)") # List installed dependencies and their versions logger.info("Installed dependencies:") # Core dependencies dependencies = [ ("urllib3", "core"), ("typing_extensions", "core (Python < 3.12)"), ] # Optional dependencies optional_dependencies = [ ("fastimport", "fastimport"), ("gpg", "pgp"), ("paramiko", "paramiko"), ("rich", "colordiff"), ("merge3", "merge"), ("patiencediff", "patiencediff"), ("atheris", "fuzzing"), ] for dep, dep_type in dependencies + optional_dependencies: try: module = __import__(dep) version = getattr(module, "__version__", "(unknown)") logger.info(" %s: %s [%s]", dep, version, dep_type) except ImportError: logger.info(" %s: (not installed) [%s]", dep, dep_type) class cmd_merge(Command): """Join two or more development histories together.""" def run(self, args: Sequence[str]) -> int | None: """Execute the merge command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("commit", type=str, nargs="+", help="Commit(s) to merge") parser.add_argument( "--no-commit", action="store_true", help="Do not create a merge commit" ) parser.add_argument( "--no-ff", action="store_true", help="Force create a merge commit" ) parser.add_argument("-m", "--message", type=str, help="Merge commit message") parsed_args = parser.parse_args(args) try: # If multiple commits are provided, pass them as a list # If only one commit is provided, pass it as a string if len(parsed_args.commit) == 1: committish = parsed_args.commit[0] else: committish = parsed_args.commit merge_commit_id, conflicts = porcelain.merge( ".", committish, no_commit=parsed_args.no_commit, no_ff=parsed_args.no_ff, message=parsed_args.message, ) if conflicts: logger.warning("Merge conflicts in %d file(s):", len(conflicts)) for conflict_path in conflicts: logger.warning(" %s", conflict_path.decode()) if len(parsed_args.commit) > 1: logger.error( "Octopus merge failed; refusing to merge with conflicts." ) else: logger.error( "Automatic merge failed; fix conflicts and then commit the result." ) return 1 elif merge_commit_id is None and not parsed_args.no_commit: logger.info("Already up to date.") elif parsed_args.no_commit: logger.info("Automatic merge successful; not committing as requested.") else: assert merge_commit_id is not None if len(parsed_args.commit) > 1: logger.info( "Octopus merge successful. Created merge commit %s", merge_commit_id.decode(), ) else: logger.info( "Merge successful. Created merge commit %s", merge_commit_id.decode(), ) return 0 except porcelain.Error as e: logger.error("%s", e) return 1 class cmd_merge_base(Command): """Find the best common ancestor between commits.""" def run(self, args: Sequence[str]) -> int | None: """Execute the merge-base command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Find the best common ancestor between commits", prog="dulwich merge-base", ) parser.add_argument("commits", nargs="+", help="Commits to find merge base for") parser.add_argument("--all", action="store_true", help="Output all merge bases") parser.add_argument( "--octopus", action="store_true", help="Compute common ancestor of all commits", ) parser.add_argument( "--is-ancestor", action="store_true", help="Check if first commit is ancestor of second", ) parser.add_argument( "--independent", action="store_true", help="List commits not reachable from others", ) parsed_args = parser.parse_args(args) try: if parsed_args.is_ancestor: if len(parsed_args.commits) != 2: logger.error("--is-ancestor requires exactly two commits") return 1 is_anc = porcelain.is_ancestor( ".", ancestor=parsed_args.commits[0], descendant=parsed_args.commits[1], ) return 0 if is_anc else 1 elif parsed_args.independent: commits = porcelain.independent_commits(".", parsed_args.commits) for commit_id in commits: print(commit_id.decode()) return 0 else: if len(parsed_args.commits) < 2: logger.error("At least two commits are required") return 1 merge_bases = porcelain.merge_base( ".", parsed_args.commits, all=parsed_args.all, octopus=parsed_args.octopus, ) for commit_id in merge_bases: print(commit_id.decode()) return 0 except (ValueError, KeyError) as e: logger.error("%s", e) return 1 class cmd_notes_add(Command): """Add notes to a commit.""" def run(self, args: Sequence[str]) -> None: """Execute the notes-add command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("object", help="Object to annotate") parser.add_argument("-m", "--message", help="Note message", required=True) parser.add_argument( "--ref", default="commits", help="Notes ref (default: commits)" ) parsed_args = parser.parse_args(args) porcelain.notes_add( ".", parsed_args.object, parsed_args.message, ref=parsed_args.ref ) class cmd_notes_show(Command): """Show notes for a commit.""" def run(self, args: Sequence[str]) -> None: """Execute the notes-show command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("object", help="Object to show notes for") parser.add_argument( "--ref", default="commits", help="Notes ref (default: commits)" ) parsed_args = parser.parse_args(args) note = porcelain.notes_show(".", parsed_args.object, ref=parsed_args.ref) if note: sys.stdout.buffer.write(note) else: logger.info("No notes found for object %s", parsed_args.object) class cmd_notes_remove(Command): """Remove notes for a commit.""" def run(self, args: Sequence[str]) -> None: """Execute the notes-remove command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("object", help="Object to remove notes from") parser.add_argument( "--ref", default="commits", help="Notes ref (default: commits)" ) parsed_args = parser.parse_args(args) result = porcelain.notes_remove(".", parsed_args.object, ref=parsed_args.ref) if result: logger.info("Removed notes for object %s", parsed_args.object) else: logger.info("No notes found for object %s", parsed_args.object) class cmd_notes_list(Command): """List all note objects.""" def run(self, args: Sequence[str]) -> None: """Execute the notes-list command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "--ref", default="commits", help="Notes ref (default: commits)" ) parsed_args = parser.parse_args(args) notes = porcelain.notes_list(".", ref=parsed_args.ref) for object_sha, note_content in notes: logger.info(object_sha.hex()) class cmd_notes(SuperCommand): """Add or inspect object notes.""" subcommands: ClassVar[dict[str, type[Command]]] = { "add": cmd_notes_add, "show": cmd_notes_show, "remove": cmd_notes_remove, "list": cmd_notes_list, } default_command = cmd_notes_list class cmd_replace_list(Command): """List all replacement refs.""" def run(self, args: Sequence[str]) -> None: """Execute the replace-list command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.parse_args(args) replacements = porcelain.replace_list(".") for object_sha, replacement_sha in replacements: sys.stdout.write( f"{object_sha.decode('ascii')} -> {replacement_sha.decode('ascii')}\n" ) class cmd_replace_delete(Command): """Delete a replacement ref.""" def run(self, args: Sequence[str]) -> int | None: """Execute the replace-delete command. 
Args: args: Command line arguments Returns: Exit code (0 for success, 1 for error) """ parser = argparse.ArgumentParser() parser.add_argument("object", help="Object whose replacement should be removed") parsed_args = parser.parse_args(args) try: porcelain.replace_delete(".", parsed_args.object) logger.info("Deleted replacement for %s", parsed_args.object) return None except KeyError as e: logger.error(str(e)) return 1 class cmd_replace(SuperCommand): """Create, list, and delete replacement refs.""" subcommands: ClassVar[dict[str, type[Command]]] = { "list": cmd_replace_list, "delete": cmd_replace_delete, } default_command = cmd_replace_list def run(self, args: Sequence[str]) -> int | None: """Execute the replace command. Args: args: Command line arguments Returns: Exit code (0 for success, 1 for error) """ # Special case: if we have exactly 2 args and no subcommand, treat as create if len(args) == 2 and args[0] not in self.subcommands: # This is the create form: git replace parser = argparse.ArgumentParser() parser.add_argument("object", help="Object to replace") parser.add_argument("replacement", help="Replacement object") parsed_args = parser.parse_args(args) porcelain.replace_create(".", parsed_args.object, parsed_args.replacement) logger.info( "Created replacement: %s -> %s", parsed_args.object, parsed_args.replacement, ) return None # Otherwise, delegate to supercommand handling return super().run(args) class cmd_cherry(Command): """Find commits not merged upstream.""" def run(self, args: Sequence[str]) -> int | None: """Execute the cherry command. Args: args: Command line arguments Returns: Exit code (0 for success, 1 for error) """ parser = argparse.ArgumentParser(description="Find commits not merged upstream") parser.add_argument( "-v", "--verbose", action="store_true", help="Show commit messages", ) parser.add_argument( "upstream", nargs="?", help="Upstream branch (default: tracking branch or HEAD^)", ) parser.add_argument( "head", nargs="?", help="Head branch (default: HEAD)", ) parser.add_argument( "limit", nargs="?", help="Limit commits to those after this ref", ) parsed_args = parser.parse_args(args) try: results = porcelain.cherry( ".", upstream=parsed_args.upstream, head=parsed_args.head, limit=parsed_args.limit, verbose=parsed_args.verbose, ) except (NotGitRepository, OSError, FileFormatException, ValueError) as e: logger.error(f"Error: {e}") return 1 # Output results for status, commit_sha, message in results: # Convert commit_sha to hex string if isinstance(commit_sha, bytes): commit_hex = commit_sha.hex() else: commit_hex = commit_sha if parsed_args.verbose and message: message_str = message.decode("utf-8", errors="replace") logger.info(f"{status} {commit_hex} {message_str}") else: logger.info(f"{status} {commit_hex}") return 0 class cmd_cherry_pick(Command): """Apply the changes introduced by some existing commits.""" def run(self, args: Sequence[str]) -> int | None: """Execute the cherry-pick command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Apply the changes introduced by some existing commits" ) parser.add_argument("commit", nargs="?", help="Commit to cherry-pick") parser.add_argument( "-n", "--no-commit", action="store_true", help="Apply changes without making a commit", ) parser.add_argument( "--continue", dest="continue_", action="store_true", help="Continue after resolving conflicts", ) parser.add_argument( "--abort", action="store_true", help="Abort the current cherry-pick operation", ) parsed_args = parser.parse_args(args) # Check argument validity if parsed_args.continue_ or parsed_args.abort: if parsed_args.commit is not None: parser.error("Cannot specify commit with --continue or --abort") return 1 else: if parsed_args.commit is None: parser.error("Commit argument is required") return 1 try: commit_arg = parsed_args.commit result = porcelain.cherry_pick( ".", commit_arg, no_commit=parsed_args.no_commit, continue_=parsed_args.continue_, abort=parsed_args.abort, ) if parsed_args.abort: logger.info("Cherry-pick aborted.") elif parsed_args.continue_: if result: logger.info("Cherry-pick completed: %s", result.decode()) else: logger.info("Cherry-pick completed.") elif result is None: if parsed_args.no_commit: logger.info("Cherry-pick applied successfully (no commit created).") else: # This shouldn't happen unless there were conflicts logger.warning("Cherry-pick resulted in conflicts.") else: logger.info("Cherry-pick successful: %s", result.decode()) return None except porcelain.Error as e: logger.error("%s", e) return 1 class cmd_merge_tree(Command): """Show three-way merge without touching index.""" def run(self, args: Sequence[str]) -> int | None: """Execute the merge-tree command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Perform a tree-level merge without touching the working directory" ) parser.add_argument( "base_tree", nargs="?", help="The common ancestor tree (optional, defaults to empty tree)", ) parser.add_argument("our_tree", help="Our side of the merge") parser.add_argument("their_tree", help="Their side of the merge") parser.add_argument( "-z", "--name-only", action="store_true", help="Output only conflict paths, null-terminated", ) parsed_args = parser.parse_args(args) try: # Determine base tree - if only two parsed_args provided, base is None if parsed_args.base_tree is None: # Only two arguments provided base_tree = None our_tree = parsed_args.our_tree their_tree = parsed_args.their_tree else: # Three arguments provided base_tree = parsed_args.base_tree our_tree = parsed_args.our_tree their_tree = parsed_args.their_tree merged_tree_id, conflicts = porcelain.merge_tree( ".", base_tree, our_tree, their_tree ) if parsed_args.name_only: # Output only conflict paths, null-terminated for conflict_path in conflicts: sys.stdout.buffer.write(conflict_path) sys.stdout.buffer.write(b"\0") else: # Output the merged tree SHA logger.info(merged_tree_id.decode("ascii")) # Output conflict information if conflicts: logger.warning("\nConflicts in %d file(s):", len(conflicts)) for conflict_path in conflicts: logger.warning(" %s", conflict_path.decode()) return None except porcelain.Error as e: logger.error("%s", e) return 1 except KeyError as e: logger.error("Object not found: %s", e) return 1 class cmd_gc(Command): """Cleanup unnecessary files and optimize the local repository.""" def run(self, args: Sequence[str]) -> int | None: """Execute the gc command. 
Args: args: Command line arguments """ import datetime import time parser = argparse.ArgumentParser() parser.add_argument( "--auto", action="store_true", help="Only run gc if needed", ) parser.add_argument( "--aggressive", action="store_true", help="Use more aggressive settings", ) parser.add_argument( "--no-prune", action="store_true", help="Do not prune unreachable objects", ) parser.add_argument( "--prune", nargs="?", const="now", help="Prune unreachable objects older than date (default: 2 weeks ago)", ) parser.add_argument( "--dry-run", "-n", action="store_true", help="Only report what would be done", ) parser.add_argument( "--quiet", "-q", action="store_true", help="Only report errors", ) parsed_args = parser.parse_args(args) # Parse prune grace period grace_period = None if parsed_args.prune: from .approxidate import parse_relative_time try: grace_period = parse_relative_time(parsed_args.prune) except ValueError: # Try to parse as absolute date try: date = datetime.datetime.strptime(parsed_args.prune, "%Y-%m-%d") grace_period = int(time.time() - date.timestamp()) except ValueError: logger.error("Invalid prune date: %s", parsed_args.prune) return 1 elif not parsed_args.no_prune: # Default to 2 weeks grace_period = 1209600 # Progress callback def progress(msg: str) -> None: if not parsed_args.quiet: logger.info(msg) try: stats = porcelain.gc( ".", auto=parsed_args.auto, aggressive=parsed_args.aggressive, prune=not parsed_args.no_prune, grace_period=grace_period, dry_run=parsed_args.dry_run, progress=progress if not parsed_args.quiet else None, ) # Report results if not parsed_args.quiet: if parsed_args.dry_run: logger.info("\nDry run results:") else: logger.info("\nGarbage collection complete:") if stats.pruned_objects: logger.info( " Pruned %d unreachable objects", len(stats.pruned_objects) ) logger.info(" Freed %s", format_bytes(stats.bytes_freed)) if stats.packs_before != stats.packs_after: logger.info( " Reduced pack files from %d to %d", stats.packs_before, stats.packs_after, ) except porcelain.Error as e: logger.error("%s", e) return 1 return None class cmd_maintenance(Command): """Run tasks to optimize Git repository data.""" def run(self, args: Sequence[str]) -> int | None: """Execute the maintenance command. 
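        ``run`` delegates to ``porcelain.maintenance_run``; a minimal sketch of
        the equivalent call (the task name is illustrative and depends on which
        tasks the porcelain layer supports):

            from dulwich import porcelain
            result = porcelain.maintenance_run(".", tasks=["gc"], auto=False)
            # result.tasks_succeeded / result.tasks_failed list the outcomes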
Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Run tasks to optimize Git repository data" ) subparsers = parser.add_subparsers( dest="subcommand", help="Maintenance subcommand" ) # maintenance run subcommand run_parser = subparsers.add_parser("run", help="Run maintenance tasks") run_parser.add_argument( "--task", action="append", dest="tasks", help="Run a specific task (can be specified multiple times)", ) run_parser.add_argument( "--auto", action="store_true", help="Only run tasks if needed", ) run_parser.add_argument( "--quiet", "-q", action="store_true", help="Only report errors", ) # maintenance start subcommand (placeholder) subparsers.add_parser("start", help="Start background maintenance") # maintenance stop subcommand (placeholder) subparsers.add_parser("stop", help="Stop background maintenance") # maintenance register subcommand subparsers.add_parser("register", help="Register repository for maintenance") # maintenance unregister subcommand unregister_parser = subparsers.add_parser( "unregister", help="Unregister repository from maintenance" ) unregister_parser.add_argument( "--force", action="store_true", help="Don't error if repository is not registered", ) parsed_args = parser.parse_args(args) if not parsed_args.subcommand: parser.print_help() return 1 if parsed_args.subcommand == "run": # Progress callback def progress(msg: str) -> None: if not parsed_args.quiet: logger.info(msg) try: result = porcelain.maintenance_run( ".", tasks=parsed_args.tasks, auto=parsed_args.auto, progress=progress if not parsed_args.quiet else None, ) # Report results if not parsed_args.quiet: if result.tasks_succeeded: logger.info("\nSuccessfully completed tasks:") for task in result.tasks_succeeded: logger.info(f" - {task}") if result.tasks_failed: logger.error("\nFailed tasks:") for task in result.tasks_failed: error_msg = result.errors.get(task, "Unknown error") logger.error(f" - {task}: {error_msg}") return 1 except porcelain.Error as e: logger.error("%s", e) return 1 elif parsed_args.subcommand == "register": porcelain.maintenance_register(".") logger.info("Repository registered for background maintenance") elif parsed_args.subcommand == "unregister": try: force = getattr(parsed_args, "force", False) porcelain.maintenance_unregister(".", force=force) except ValueError as e: logger.error(str(e)) return 1 logger.info("Repository unregistered from background maintenance") elif parsed_args.subcommand in ("start", "stop"): # TODO: Implement background maintenance scheduling logger.error( f"The '{parsed_args.subcommand}' subcommand is not yet implemented" ) return 1 else: parser.print_help() return 1 return None class cmd_grep(Command): """Search for patterns in tracked files.""" def run(self, args: Sequence[str]) -> None: """Execute the grep command. 
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("pattern", help="Regular expression pattern to search for") parser.add_argument( "revision", nargs="?", default=None, help="Revision to search (defaults to HEAD)", ) parser.add_argument( "pathspecs", nargs="*", help="Path patterns to limit search", ) parser.add_argument( "-i", "--ignore-case", action="store_true", help="Perform case-insensitive matching", ) parser.add_argument( "-n", "--line-number", action="store_true", help="Show line numbers for matches", ) parser.add_argument( "--max-depth", type=int, default=None, help="Maximum directory depth to search", ) parser.add_argument( "--no-ignore", action="store_true", help="Do not respect .gitignore patterns", ) parsed_args = parser.parse_args(args) # Handle the case where revision might be a pathspec revision = parsed_args.revision pathspecs = parsed_args.pathspecs # If revision looks like a pathspec (contains wildcards or slashes), # treat it as a pathspec instead if revision and ("*" in revision or "/" in revision or "." in revision): pathspecs = [revision, *pathspecs] revision = None with Repo(".") as repo: config = repo.get_config_stack() with get_pager(config=config, cmd_name="grep") as outstream: porcelain.grep( repo, parsed_args.pattern, outstream=outstream, rev=revision, pathspecs=pathspecs if pathspecs else None, ignore_case=parsed_args.ignore_case, line_number=parsed_args.line_number, max_depth=parsed_args.max_depth, respect_ignores=not parsed_args.no_ignore, ) class cmd_count_objects(Command): """Count unpacked number of objects and their disk consumption.""" def run(self, args: Sequence[str]) -> None: """Execute the count-objects command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "-v", "--verbose", action="store_true", help="Display verbose information.", ) parsed_args = parser.parse_args(args) if parsed_args.verbose: stats = porcelain.count_objects(".", verbose=True) # Display verbose output logger.info("count: %d", stats.count) logger.info("size: %d", stats.size // 1024) # Size in KiB assert stats.in_pack is not None logger.info("in-pack: %d", stats.in_pack) assert stats.packs is not None logger.info("packs: %d", stats.packs) assert stats.size_pack is not None logger.info("size-pack: %d", stats.size_pack // 1024) # Size in KiB else: # Simple output stats = porcelain.count_objects(".", verbose=False) logger.info("%d objects, %d kilobytes", stats.count, stats.size // 1024) class cmd_rebase(Command): """Reapply commits on top of another base tip.""" def run(self, args: Sequence[str]) -> int: """Execute the rebase command. 
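        A plain (non-interactive) rebase maps onto ``porcelain.rebase``; a
        minimal sketch (the upstream name is illustrative):

            from dulwich import porcelain
            new_shas = porcelain.rebase(".", "origin/main")
            # new_shas is empty when the branch is already up to date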
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "upstream", nargs="?", help="Upstream branch to rebase onto" ) parser.add_argument("--onto", type=str, help="Rebase onto specific commit") parser.add_argument( "--branch", type=str, help="Branch to rebase (default: current)" ) parser.add_argument( "-i", "--interactive", action="store_true", help="Interactive rebase" ) parser.add_argument( "--edit-todo", action="store_true", help="Edit the todo list during an interactive rebase", ) parser.add_argument( "--abort", action="store_true", help="Abort an in-progress rebase" ) parser.add_argument( "--continue", dest="continue_rebase", action="store_true", help="Continue an in-progress rebase", ) parser.add_argument( "--skip", action="store_true", help="Skip current commit and continue" ) parsed_args = parser.parse_args(args) # Handle abort/continue/skip first if parsed_args.abort: try: porcelain.rebase(".", parsed_args.upstream or "HEAD", abort=True) logger.info("Rebase aborted.") except porcelain.Error as e: logger.error("%s", e) return 1 return 0 if parsed_args.continue_rebase: try: # Check if interactive rebase is in progress if porcelain.is_interactive_rebase("."): result = porcelain.rebase( ".", parsed_args.upstream or "HEAD", continue_rebase=True, interactive=True, ) if result: logger.info("Rebase complete.") else: logger.info("Rebase paused. Use --continue to resume.") else: new_shas = porcelain.rebase( ".", parsed_args.upstream or "HEAD", continue_rebase=True ) logger.info("Rebase complete.") except porcelain.Error as e: logger.error("%s", e) return 1 return 0 if parsed_args.edit_todo: # Edit todo list for interactive rebase try: porcelain.rebase(".", parsed_args.upstream or "HEAD", edit_todo=True) logger.info("Todo list updated.") except porcelain.Error as e: logger.error("%s", e) return 1 return 0 # Normal rebase requires upstream if not parsed_args.upstream: logger.error("Missing required argument 'upstream'") return 1 try: if parsed_args.interactive: # Interactive rebase result = porcelain.rebase( ".", parsed_args.upstream, onto=parsed_args.onto, branch=parsed_args.branch, interactive=True, ) if result: logger.info( "Interactive rebase started. Edit the todo list and save." ) else: logger.info("No commits to rebase.") else: # Regular rebase new_shas = porcelain.rebase( ".", parsed_args.upstream, onto=parsed_args.onto, branch=parsed_args.branch, ) if new_shas: logger.info("Successfully rebased %d commits.", len(new_shas)) else: logger.info("Already up to date.") return 0 except porcelain.Error as e: logger.error("%s", e) return 1 class cmd_filter_branch(Command): """Rewrite branches.""" def run(self, args: Sequence[str]) -> int | None: """Execute the filter-branch command. 
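        The rewrite itself is performed by ``porcelain.filter_branch``; a
        minimal sketch of the equivalent call (the subdirectory name is
        illustrative):

            from dulwich import porcelain
            from dulwich.repo import Repo
            with Repo(".") as r:
                # keep_original=True preserves the refs/original/* backup refs
                rewritten = porcelain.filter_branch(
                    r, "HEAD", subdirectory_filter="src", keep_original=True
                )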
Args: args: Command line arguments """ import subprocess parser = argparse.ArgumentParser(description="Rewrite branches") # Supported Git-compatible options parser.add_argument( "--subdirectory-filter", type=str, help="Only include history for subdirectory", ) parser.add_argument("--env-filter", type=str, help="Environment filter command") parser.add_argument("--tree-filter", type=str, help="Tree filter command") parser.add_argument("--index-filter", type=str, help="Index filter command") parser.add_argument("--parent-filter", type=str, help="Parent filter command") parser.add_argument("--msg-filter", type=str, help="Message filter command") parser.add_argument("--commit-filter", type=str, help="Commit filter command") parser.add_argument( "--tag-name-filter", type=str, help="Tag name filter command" ) parser.add_argument( "--prune-empty", action="store_true", help="Remove empty commits" ) parser.add_argument( "--original", type=str, default="refs/original", help="Namespace for original refs", ) parser.add_argument( "-f", "--force", action="store_true", help="Force operation even if refs/original/* exists", ) # Branch/ref to rewrite (defaults to HEAD) parser.add_argument( "branch", nargs="?", default="HEAD", help="Branch or ref to rewrite" ) parsed_args = parser.parse_args(args) # Track if any filter fails filter_error = False # Setup environment for filters env = os.environ.copy() # Helper function to run shell commands def run_filter( cmd: str, input_data: bytes | None = None, cwd: str | None = None, extra_env: dict[str, str] | None = None, ) -> bytes | None: nonlocal filter_error filter_env = env.copy() if extra_env: filter_env.update(extra_env) result = subprocess.run( cmd, shell=True, input=input_data, cwd=cwd, env=filter_env, capture_output=True, ) if result.returncode != 0: filter_error = True return None return result.stdout # Create filter functions based on arguments filter_message = None if parsed_args.msg_filter: def filter_message(message: bytes) -> bytes: result = run_filter(parsed_args.msg_filter, input_data=message) return result if result is not None else message tree_filter = None if parsed_args.tree_filter: def tree_filter(tree_sha: ObjectID, tmpdir: str) -> ObjectID: from dulwich.objects import Blob, Tree # Export tree to tmpdir with Repo(".") as r: tree = r.object_store[tree_sha] assert isinstance(tree, Tree) for entry in tree.iteritems(): assert entry.path is not None assert entry.sha is not None path = Path(tmpdir) / entry.path.decode() obj = r.object_store[entry.sha] if isinstance(obj, Tree): path.mkdir(exist_ok=True) else: assert isinstance(obj, Blob) path.write_bytes(obj.data) # Run the filter command in the temp directory run_filter(parsed_args.tree_filter, cwd=tmpdir) # Rebuild tree from modified temp directory def build_tree_from_dir(dir_path: str) -> ObjectID: tree = Tree() for name in sorted(os.listdir(dir_path)): if name.startswith("."): continue path = os.path.join(dir_path, name) if os.path.isdir(path): subtree_sha = build_tree_from_dir(path) tree.add(name.encode(), 0o040000, subtree_sha) else: with open(path, "rb") as f: data = f.read() blob = Blob.from_string(data) r.object_store.add_object(blob) # Use appropriate file mode mode = os.stat(path).st_mode if mode & 0o100: file_mode = 0o100755 else: file_mode = 0o100644 tree.add(name.encode(), file_mode, blob.id) r.object_store.add_object(tree) return tree.id return build_tree_from_dir(tmpdir) index_filter = None if parsed_args.index_filter: def index_filter(tree_sha: ObjectID, index_path: str) -> ObjectID | 
None: run_filter( parsed_args.index_filter, extra_env={"GIT_INDEX_FILE": index_path} ) return None # Read back from index parent_filter = None if parsed_args.parent_filter: def parent_filter(parents: Sequence[ObjectID]) -> list[ObjectID]: parent_str = " ".join(p.hex() for p in parents) result = run_filter( parsed_args.parent_filter, input_data=parent_str.encode() ) if result is None: return list(parents) output = result.decode().strip() if not output: return [] new_parents = [] for sha in output.split(): sha_bytes = sha.encode() if valid_hexsha(sha_bytes): new_parents.append(ObjectID(sha_bytes)) return new_parents commit_filter = None if parsed_args.commit_filter: def commit_filter( commit_obj: Commit, tree_sha: ObjectID ) -> ObjectID | None: # The filter receives: tree parent1 parent2... cmd_input = tree_sha.hex() for parent in commit_obj.parents: cmd_input += " " + parent.hex() result = run_filter( parsed_args.commit_filter, input_data=cmd_input.encode(), extra_env={"GIT_COMMIT": commit_obj.id.hex()}, ) if result is None: return None output = result.decode().strip() if not output: return None # Skip commit if valid_hexsha(output): return ObjectID(output.encode()) return None tag_name_filter = None if parsed_args.tag_name_filter: def tag_name_filter(tag_name: bytes) -> bytes: result = run_filter(parsed_args.tag_name_filter, input_data=tag_name) return result if result is not None else tag_name # Open repo once with Repo(".") as r: # Check for refs/original if not forcing if not parsed_args.force: original_prefix = parsed_args.original.encode() + b"/" for ref in r.refs.allkeys(): if ref.startswith(original_prefix): logger.error("Cannot create a new backup.") logger.error( "A previous backup already exists in %s/", parsed_args.original, ) logger.error("Force overwriting the backup with -f") return 1 try: # Call porcelain.filter_branch with the repo object result = porcelain.filter_branch( r, parsed_args.branch, filter_message=filter_message, tree_filter=tree_filter if parsed_args.tree_filter else None, index_filter=index_filter if parsed_args.index_filter else None, parent_filter=parent_filter if parsed_args.parent_filter else None, commit_filter=commit_filter if parsed_args.commit_filter else None, subdirectory_filter=parsed_args.subdirectory_filter, prune_empty=parsed_args.prune_empty, tag_name_filter=tag_name_filter if parsed_args.tag_name_filter else None, force=parsed_args.force, keep_original=True, # Always keep original with git ) # Check if any filter failed if filter_error: logger.error("Filter command failed") return 1 # Git filter-branch shows progress if result: logger.info( "Rewrite %s (%d commits)", parsed_args.branch, len(result) ) # Git shows: Ref 'refs/heads/branch' was rewritten if parsed_args.branch != "HEAD": ref_name = ( parsed_args.branch if parsed_args.branch.startswith("refs/") else f"refs/heads/{parsed_args.branch}" ) logger.info("Ref '%s' was rewritten", ref_name) return 0 except porcelain.Error as e: logger.error("%s", e) return 1 class cmd_lfs(Command): """Git Large File Storage management.""" def run(self, argv: Sequence[str]) -> None: """Execute the lfs command. 
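        The subcommands map onto the ``porcelain.lfs_*`` helpers; a minimal
        sketch (the pattern is illustrative):

            from dulwich import porcelain
            porcelain.lfs_init()
            tracked = porcelain.lfs_track(patterns=["*.bin"])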
Args: argv: Command line arguments """ parser = argparse.ArgumentParser(prog="dulwich lfs") subparsers = parser.add_subparsers(dest="subcommand", help="LFS subcommands") # lfs init subparsers.add_parser("init", help="Initialize Git LFS") # lfs track parser_track = subparsers.add_parser( "track", help="Track file patterns with LFS" ) parser_track.add_argument("patterns", nargs="*", help="File patterns to track") # lfs untrack parser_untrack = subparsers.add_parser( "untrack", help="Untrack file patterns from LFS" ) parser_untrack.add_argument( "patterns", nargs="+", help="File patterns to untrack" ) # lfs ls-files parser_ls = subparsers.add_parser("ls-files", help="List LFS files") parser_ls.add_argument("--ref", help="Git ref to check (defaults to HEAD)") # lfs migrate parser_migrate = subparsers.add_parser("migrate", help="Migrate files to LFS") parser_migrate.add_argument("--include", nargs="+", help="Patterns to include") parser_migrate.add_argument("--exclude", nargs="+", help="Patterns to exclude") parser_migrate.add_argument( "--everything", action="store_true", help="Migrate all files above 100MB" ) # lfs pointer parser_pointer = subparsers.add_parser("pointer", help="Check LFS pointers") parser_pointer.add_argument( "--check", nargs="*", dest="paths", help="Check if files are LFS pointers" ) # lfs clean parser_clean = subparsers.add_parser("clean", help="Clean file to LFS pointer") parser_clean.add_argument("path", help="File path to clean") # lfs smudge parser_smudge = subparsers.add_parser( "smudge", help="Smudge LFS pointer to content" ) parser_smudge.add_argument( "--stdin", action="store_true", help="Read pointer from stdin" ) # lfs fetch parser_fetch = subparsers.add_parser( "fetch", help="Fetch LFS objects from remote" ) parser_fetch.add_argument( "--remote", default="origin", help="Remote to fetch from" ) parser_fetch.add_argument("refs", nargs="*", help="Specific refs to fetch") # lfs pull parser_pull = subparsers.add_parser( "pull", help="Pull LFS objects for current checkout" ) parser_pull.add_argument( "--remote", default="origin", help="Remote to pull from" ) # lfs push parser_push = subparsers.add_parser("push", help="Push LFS objects to remote") parser_push.add_argument("--remote", default="origin", help="Remote to push to") parser_push.add_argument("refs", nargs="*", help="Specific refs to push") # lfs status subparsers.add_parser("status", help="Show status of LFS files") args = parser.parse_args(argv) if args.subcommand == "init": porcelain.lfs_init() logger.info("Git LFS initialized.") elif args.subcommand == "track": if args.patterns: tracked = porcelain.lfs_track(patterns=args.patterns) logger.info("Tracking patterns:") else: tracked = porcelain.lfs_track() logger.info("Currently tracked patterns:") for pattern in tracked: logger.info(" %s", pattern) elif args.subcommand == "untrack": tracked = porcelain.lfs_untrack(patterns=args.patterns) logger.info("Remaining tracked patterns:") for pattern in tracked: logger.info(" %s", to_display_str(pattern)) elif args.subcommand == "ls-files": files = porcelain.lfs_ls_files(ref=args.ref) for path, oid, size in files: logger.info( "%s * %s (%s)", to_display_str(oid[:12]), to_display_str(path), format_bytes(size), ) elif args.subcommand == "migrate": count = porcelain.lfs_migrate( include=args.include, exclude=args.exclude, everything=args.everything ) logger.info("Migrated %d file(s) to Git LFS.", count) elif args.subcommand == "pointer": if args.paths is not None: results = porcelain.lfs_pointer_check(paths=args.paths or 
None) for file_path, pointer in results.items(): if pointer: logger.info( "%s: LFS pointer (oid: %s, size: %s)", to_display_str(file_path), to_display_str(pointer.oid[:12]), format_bytes(pointer.size), ) else: logger.warning( "%s: Not an LFS pointer", to_display_str(file_path) ) elif args.subcommand == "clean": pointer = porcelain.lfs_clean(path=args.path) sys.stdout.buffer.write(pointer) elif args.subcommand == "smudge": if args.stdin: pointer_content = sys.stdin.buffer.read() content = porcelain.lfs_smudge(pointer_content=pointer_content) sys.stdout.buffer.write(content) else: logger.error("--stdin required for smudge command") sys.exit(1) elif args.subcommand == "fetch": refs = args.refs or None count = porcelain.lfs_fetch(remote=args.remote, refs=refs) logger.info("Fetched %d LFS object(s).", count) elif args.subcommand == "pull": count = porcelain.lfs_pull(remote=args.remote) logger.info("Pulled %d LFS object(s).", count) elif args.subcommand == "push": refs = args.refs or None count = porcelain.lfs_push(remote=args.remote, refs=refs) logger.info("Pushed %d LFS object(s).", count) elif args.subcommand == "status": status = porcelain.lfs_status() if status["tracked"]: logger.info("LFS tracked files: %d", len(status["tracked"])) if status["missing"]: logger.warning("\nMissing LFS objects:") for file_path in status["missing"]: logger.warning(" %s", to_display_str(file_path)) if status["not_staged"]: logger.info("\nModified LFS files not staged:") for file_path in status["not_staged"]: logger.warning(" %s", to_display_str(file_path)) if not any(status.values()): logger.info("No LFS files found.") else: parser.print_help() sys.exit(1) class cmd_help(Command): """Display help information about git.""" def run(self, args: Sequence[str]) -> None: """Execute the help command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "-a", "--all", action="store_true", help="List all commands.", ) parsed_args = parser.parse_args(args) if parsed_args.all: logger.info("Available commands:") for cmd in sorted(commands): logger.info(" %s", cmd) else: logger.info( "The dulwich command line tool is currently a very basic frontend for the\n" "Dulwich python module. For full functionality, please see the API reference.\n" "\n" "For a list of supported commands, see 'dulwich help -a'." ) class cmd_format_patch(Command): """Prepare patches for e-mail submission.""" def run(self, args: Sequence[str]) -> None: """Execute the format-patch command. 
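        Patch generation is delegated to ``porcelain.format_patch``; a minimal
        sketch of the equivalent call (the output directory is illustrative):

            import sys
            from dulwich import porcelain
            filenames = porcelain.format_patch(
                ".", committish=None, outstream=sys.stdout,
                outdir="patches", n=3, stdout=False,
            )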
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "committish", nargs="?", help="Commit or commit range (e.g., HEAD~3..HEAD or origin/master..HEAD)", ) parser.add_argument( "-n", "--numbered", type=int, default=1, help="Number of commits to format (default: 1)", ) parser.add_argument( "-o", "--output-directory", dest="outdir", help="Output directory for patches", ) parser.add_argument( "--stdout", action="store_true", help="Output patches to stdout", ) parsed_args = parser.parse_args(args) # Parse committish using the new function committish: ObjectID | tuple[ObjectID, ObjectID] | None = None if parsed_args.committish: with Repo(".") as r: range_result = parse_commit_range(r, parsed_args.committish) if range_result: # Convert Commit objects to their SHAs committish = (range_result[0].id, range_result[1].id) else: committish = ObjectID( parsed_args.committish.encode() if isinstance(parsed_args.committish, str) else parsed_args.committish ) filenames = porcelain.format_patch( ".", committish=committish, outstream=sys.stdout, outdir=parsed_args.outdir, n=parsed_args.numbered, stdout=parsed_args.stdout, ) if not parsed_args.stdout: for filename in filenames: logger.info(filename) class cmd_mailsplit(Command): """Split mbox or Maildir into individual message files.""" def run(self, args: Sequence[str]) -> None: """Execute the mailsplit command. Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "mbox", nargs="?", help="Path to mbox file or Maildir. If not specified, reads from stdin.", ) parser.add_argument( "-o", "--output-directory", dest="output_dir", required=True, help="Directory in which to place the individual messages", ) parser.add_argument( "-b", action="store_true", dest="single_mail", help="If any file doesn't begin with a From line, assume it is a single mail message", ) parser.add_argument( "-d", dest="precision", type=int, default=4, help="Number of digits for generated filenames (default: 4)", ) parser.add_argument( "-f", dest="start_number", type=int, default=1, help="Skip the first numbers (default: 1)", ) parser.add_argument( "--keep-cr", action="store_true", help="Do not remove \\r from lines ending with \\r\\n", ) parser.add_argument( "--mboxrd", action="store_true", help='Input is of the "mboxrd" format and "^>+From " line escaping is reversed', ) parsed_args = parser.parse_args(args) # Determine if input is a Maildir is_maildir = False if parsed_args.mbox: input_path = Path(parsed_args.mbox) if input_path.is_dir(): # Check if it's a Maildir (has cur, tmp, new subdirectories) if ( (input_path / "cur").exists() and (input_path / "tmp").exists() and (input_path / "new").exists() ): is_maildir = True else: input_path = None # Call porcelain function output_files = porcelain.mailsplit( input_path=input_path, output_dir=parsed_args.output_dir, start_number=parsed_args.start_number, precision=parsed_args.precision, keep_cr=parsed_args.keep_cr, mboxrd=parsed_args.mboxrd, is_maildir=is_maildir, ) # Print information about the split logger.info( "Split %d messages into %s", len(output_files), parsed_args.output_dir ) class cmd_mailinfo(Command): """Extract patch information from an email message.""" def run(self, args: Sequence[str]) -> None: """Execute the mailinfo command. 
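        Parsing is handled by ``porcelain.mailinfo``; a minimal sketch (file
        names are illustrative):

            from dulwich import porcelain
            result = porcelain.mailinfo(
                input_path="0001.eml", msg_file="msg", patch_file="patch"
            )
            # result.author_name / result.author_email / result.subject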
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument( "msg", help="File to write commit message", ) parser.add_argument( "patch", help="File to write patch content", ) parser.add_argument( "mail", nargs="?", help="Path to email file. If not specified, reads from stdin.", ) parser.add_argument( "-k", action="store_true", dest="keep_subject", help="Pass -k flag to git mailinfo (keeps [PATCH] and other subject tags)", ) parser.add_argument( "-b", action="store_true", dest="keep_non_patch", help="Pass -b flag to git mailinfo (only strip [PATCH] tags)", ) parser.add_argument( "--encoding", dest="encoding", help="Character encoding to use (default: detect from message)", ) parser.add_argument( "--scissors", action="store_true", help="Remove everything before scissors line", ) parser.add_argument( "-m", "--message-id", action="store_true", dest="message_id", help="Copy Message-ID to the end of the commit message", ) parsed_args = parser.parse_args(args) # Call porcelain function result = porcelain.mailinfo( input_path=parsed_args.mail, msg_file=parsed_args.msg, patch_file=parsed_args.patch, keep_subject=parsed_args.keep_subject, keep_non_patch=parsed_args.keep_non_patch, encoding=parsed_args.encoding, scissors=parsed_args.scissors, message_id=parsed_args.message_id, ) # Print author info to stdout (as git mailinfo does) print(f"Author: {result.author_name}") print(f"Email: {result.author_email}") print(f"Subject: {result.subject}") if result.author_date: print(f"Date: {result.author_date}") class cmd_bundle(Command): """Create, unpack, and manipulate bundle files.""" def run(self, args: Sequence[str]) -> int: """Execute the bundle command. Args: args: Command line arguments """ if not args: logger.error("Usage: bundle ") return 1 subcommand = args[0] subargs = args[1:] if subcommand == "create": return self._create(subargs) elif subcommand == "verify": return self._verify(subargs) elif subcommand == "list-heads": return self._list_heads(subargs) elif subcommand == "unbundle": return self._unbundle(subargs) else: logger.error("Unknown bundle subcommand: %s", subcommand) return 1 def _create(self, args: Sequence[str]) -> int: parser = argparse.ArgumentParser(prog="bundle create") parser.add_argument( "-q", "--quiet", action="store_true", help="Suppress progress" ) parser.add_argument("--progress", action="store_true", help="Show progress") parser.add_argument( "--version", type=int, choices=[2, 3], help="Bundle version" ) parser.add_argument("--all", action="store_true", help="Include all refs") parser.add_argument("--stdin", action="store_true", help="Read refs from stdin") parser.add_argument("file", help="Output bundle file (use - for stdout)") parser.add_argument("refs", nargs="*", help="References or rev-list args") parsed_args = parser.parse_args(args) repo = Repo(".") progress = None if parsed_args.progress and not parsed_args.quiet: def progress(*args: str | int) -> None: # Handle both progress(msg) and progress(count, msg) signatures if len(args) == 1: msg = args[0] elif len(args) == 2: _count, msg = args else: msg = str(args) # Convert bytes to string if needed if isinstance(msg, bytes): msg = msg.decode("utf-8", "replace") logger.error("%s", msg) refs_to_include: list[Ref] = [] prerequisites = [] if parsed_args.all: refs_to_include = list(repo.refs.keys()) elif parsed_args.stdin: for line in sys.stdin: ref = line.strip().encode("utf-8") if ref: refs_to_include.append(Ref(ref)) elif parsed_args.refs: for ref_arg in parsed_args.refs: if ".." 
in ref_arg: range_result = parse_commit_range(repo, ref_arg) if range_result: start_commit, _end_commit = range_result prerequisites.append(start_commit.id) # For ranges like A..B, we need to include B if it's a ref # Split the range to get the end part end_part = ref_arg.split("..")[1] if end_part: # Not empty (not "A..") end_ref = Ref(end_part.encode("utf-8")) if end_ref in repo.refs: refs_to_include.append(end_ref) else: sha = repo.refs[Ref(ref_arg.encode("utf-8"))] refs_to_include.append(Ref(ref_arg.encode("utf-8"))) else: if ref_arg.startswith("^"): sha = repo.refs[Ref(ref_arg[1:].encode("utf-8"))] prerequisites.append(sha) else: sha = repo.refs[Ref(ref_arg.encode("utf-8"))] refs_to_include.append(Ref(ref_arg.encode("utf-8"))) else: logger.error("No refs specified. Use --all, --stdin, or specify refs") return 1 if not refs_to_include: logger.error("fatal: Refusing to create empty bundle.") return 1 bundle = create_bundle_from_repo( repo, refs=refs_to_include, prerequisites=prerequisites, version=parsed_args.version, progress=progress, ) if parsed_args.file == "-": write_bundle(sys.stdout.buffer, bundle) else: with open(parsed_args.file, "wb") as f: write_bundle(f, bundle) return 0 def _verify(self, args: Sequence[str]) -> int: parser = argparse.ArgumentParser(prog="bundle verify") parser.add_argument( "-q", "--quiet", action="store_true", help="Suppress output" ) parser.add_argument("file", help="Bundle file to verify (use - for stdin)") parsed_args = parser.parse_args(args) repo = Repo(".") def verify_bundle(bundle: Bundle) -> int: missing_prereqs = [] for prereq_sha, comment in bundle.prerequisites: try: repo.object_store[prereq_sha] except KeyError: missing_prereqs.append(prereq_sha) if missing_prereqs: if not parsed_args.quiet: logger.info("The bundle requires these prerequisite commits:") for sha in missing_prereqs: logger.info(" %s", sha.decode()) return 1 else: if not parsed_args.quiet: logger.info( "The bundle is valid and can be applied to the current repository" ) return 0 if parsed_args.file == "-": bundle = read_bundle(sys.stdin.buffer) return verify_bundle(bundle) else: with open(parsed_args.file, "rb") as f: bundle = read_bundle(f) return verify_bundle(bundle) def _list_heads(self, args: Sequence[str]) -> int: parser = argparse.ArgumentParser(prog="bundle list-heads") parser.add_argument("file", help="Bundle file (use - for stdin)") parser.add_argument("refnames", nargs="*", help="Only show these refs") parsed_args = parser.parse_args(args) def list_heads(bundle: Bundle) -> None: for ref, sha in bundle.references.items(): if not parsed_args.refnames or ref.decode() in parsed_args.refnames: logger.info("%s %s", sha.decode(), ref.decode()) if parsed_args.file == "-": bundle = read_bundle(sys.stdin.buffer) list_heads(bundle) else: with open(parsed_args.file, "rb") as f: bundle = read_bundle(f) list_heads(bundle) return 0 def _unbundle(self, args: Sequence[str]) -> int: parser = argparse.ArgumentParser(prog="bundle unbundle") parser.add_argument("--progress", action="store_true", help="Show progress") parser.add_argument("file", help="Bundle file (use - for stdin)") parser.add_argument("refnames", nargs="*", help="Only unbundle these refs") parsed_args = parser.parse_args(args) repo = Repo(".") progress = None if parsed_args.progress: def progress(*args: str | int | bytes) -> None: # Handle both progress(msg) and progress(count, msg) signatures if len(args) == 1: msg = args[0] elif len(args) == 2: _count, msg = args else: msg = str(args) # Convert bytes to string if needed 
if isinstance(msg, bytes): msg = msg.decode("utf-8", "replace") elif not isinstance(msg, str): msg = str(msg) logger.error("%s", msg) if parsed_args.file == "-": bundle = read_bundle(sys.stdin.buffer) # Process the bundle while file is still available via stdin bundle.store_objects(repo.object_store, progress=progress) else: # Keep the file open during bundle processing with open(parsed_args.file, "rb") as f: bundle = read_bundle(f) # Process pack data while file is still open bundle.store_objects(repo.object_store, progress=progress) for ref, sha in bundle.references.items(): if not parsed_args.refnames or ref.decode() in parsed_args.refnames: logger.info(ref.decode()) return 0 class cmd_worktree_add(Command): """Create a new worktree.""" """Add a new worktree to the repository.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-add command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Add a new worktree", prog="dulwich worktree add" ) parser.add_argument("path", help="Path for the new worktree") parser.add_argument("committish", nargs="?", help="Commit-ish to checkout") parser.add_argument("-b", "--create-branch", help="Create a new branch") parser.add_argument( "-B", "--force-create-branch", help="Create or reset a branch" ) parser.add_argument( "--detach", action="store_true", help="Detach HEAD in new worktree" ) parser.add_argument("--force", action="store_true", help="Force creation") parsed_args = parser.parse_args(args) from dulwich import porcelain branch = None commit = None if parsed_args.create_branch or parsed_args.force_create_branch: branch = ( parsed_args.create_branch or parsed_args.force_create_branch ).encode() elif parsed_args.committish and not parsed_args.detach: # If committish is provided and not detaching, treat as branch branch = parsed_args.committish.encode() elif parsed_args.committish: # If committish is provided and detaching, treat as commit commit = parsed_args.committish.encode() worktree_path = porcelain.worktree_add( repo=".", path=parsed_args.path, branch=branch, commit=commit, detach=parsed_args.detach, force=parsed_args.force or bool(parsed_args.force_create_branch), ) logger.info("Worktree added: %s", worktree_path) return 0 class cmd_worktree_list(Command): """List worktrees.""" """List details of each worktree.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-list command. 
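        Listing is delegated to ``porcelain.worktree_list``; a minimal sketch:

            from dulwich import porcelain
            for wt in porcelain.worktree_list(repo="."):
                # each entry exposes path, branch, head, locked, prunable, ...
                print(wt.path, wt.branch)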
Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="List worktrees", prog="dulwich worktree list" ) parser.add_argument( "-v", "--verbose", action="store_true", help="Show additional information" ) parser.add_argument( "--porcelain", action="store_true", help="Machine-readable output" ) parsed_args = parser.parse_args(args) from dulwich import porcelain worktrees = porcelain.worktree_list(repo=".") for wt in worktrees: path = wt.path if wt.bare: status = "(bare)" elif wt.detached: status = ( f"(detached HEAD {wt.head[:7].decode() if wt.head else 'unknown'})" ) elif wt.branch: branch_name = wt.branch.decode().replace("refs/heads/", "") status = f"[{branch_name}]" else: status = "(unknown)" if parsed_args.porcelain: locked = "locked" if wt.locked else "unlocked" prunable = "prunable" if wt.prunable else "unprunable" logger.info( "%s %s %s %s %s", path, wt.head.decode() if wt.head else "unknown", status, locked, prunable, ) else: line = f"{path} {status}" if wt.locked: line += " locked" if wt.prunable: line += " prunable" logger.info(line) return 0 class cmd_worktree_remove(Command): """Remove a worktree.""" """Remove a worktree.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-remove command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Remove a worktree", prog="dulwich worktree remove" ) parser.add_argument("worktree", help="Path to worktree to remove") parser.add_argument("--force", action="store_true", help="Force removal") parsed_args = parser.parse_args(args) from dulwich import porcelain porcelain.worktree_remove( repo=".", path=parsed_args.worktree, force=parsed_args.force ) logger.info("Worktree removed: %s", parsed_args.worktree) return 0 class cmd_worktree_prune(Command): """Prune worktree information.""" """Prune worktree information.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-prune command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Prune worktree information", prog="dulwich worktree prune" ) parser.add_argument( "--dry-run", action="store_true", help="Do not remove anything" ) parser.add_argument( "-v", "--verbose", action="store_true", help="Report all removals" ) parser.add_argument( "--expire", type=int, help="Expire worktrees older than time (seconds)" ) parsed_args = parser.parse_args(args) from dulwich import porcelain pruned = porcelain.worktree_prune( repo=".", dry_run=parsed_args.dry_run, expire=parsed_args.expire ) if pruned: if parsed_args.dry_run: logger.info("Would prune worktrees:") elif parsed_args.verbose: logger.info("Pruned worktrees:") for wt_id in pruned: logger.info(" %s", wt_id) elif parsed_args.verbose: logger.info("No worktrees to prune") return 0 class cmd_worktree_lock(Command): """Lock a worktree to prevent it from being pruned.""" """Lock a worktree.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-lock command. 
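        Locking is delegated to ``porcelain.worktree_lock``; a minimal sketch
        (the path and reason are illustrative):

            from dulwich import porcelain
            porcelain.worktree_lock(repo=".", path="../wt", reason="on USB disk")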
Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Lock a worktree", prog="dulwich worktree lock" ) parser.add_argument("worktree", help="Path to worktree to lock") parser.add_argument("--reason", help="Reason for locking") parsed_args = parser.parse_args(args) from dulwich import porcelain porcelain.worktree_lock( repo=".", path=parsed_args.worktree, reason=parsed_args.reason ) logger.info("Worktree locked: %s", parsed_args.worktree) return 0 class cmd_worktree_unlock(Command): """Unlock a locked worktree.""" """Unlock a worktree.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-unlock command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Unlock a worktree", prog="dulwich worktree unlock" ) parser.add_argument("worktree", help="Path to worktree to unlock") parsed_args = parser.parse_args(args) from dulwich import porcelain porcelain.worktree_unlock(repo=".", path=parsed_args.worktree) logger.info("Worktree unlocked: %s", parsed_args.worktree) return 0 class cmd_worktree_move(Command): """Move a worktree to a new location.""" """Move a worktree.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-move command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Move a worktree", prog="dulwich worktree move" ) parser.add_argument("worktree", help="Path to worktree to move") parser.add_argument("new_path", help="New path for the worktree") parsed_args = parser.parse_args(args) from dulwich import porcelain porcelain.worktree_move( repo=".", old_path=parsed_args.worktree, new_path=parsed_args.new_path ) logger.info( "Worktree moved: %s -> %s", parsed_args.worktree, parsed_args.new_path ) return 0 class cmd_worktree_repair(Command): """Repair worktree administrative files.""" """Repair worktree administrative files.""" def run(self, args: Sequence[str]) -> int | None: """Execute the worktree-repair command. Args: args: Command line arguments """ parser = argparse.ArgumentParser( description="Repair worktree administrative files", prog="dulwich worktree repair", ) parser.add_argument( "path", nargs="*", help="Paths to worktrees to repair (if not specified, repairs all)", ) parsed_args = parser.parse_args(args) from dulwich import porcelain paths = parsed_args.path if parsed_args.path else None repaired = porcelain.worktree_repair(repo=".", paths=paths) if repaired: for path in repaired: logger.info("Repaired worktree: %s", path) else: logger.info("No worktrees needed repair") return 0 class cmd_worktree(SuperCommand): """Manage multiple working trees.""" """Manage multiple working trees.""" subcommands: ClassVar[dict[str, type[Command]]] = { "add": cmd_worktree_add, "list": cmd_worktree_list, "remove": cmd_worktree_remove, "prune": cmd_worktree_prune, "lock": cmd_worktree_lock, "unlock": cmd_worktree_unlock, "move": cmd_worktree_move, "repair": cmd_worktree_repair, } default_command = cmd_worktree_list class cmd_rerere(Command): """Record and reuse recorded conflict resolutions.""" def run(self, args: Sequence[str]) -> None: """Execute the rerere command. 
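        The subcommands map onto the ``porcelain.rerere*`` helpers; a minimal
        sketch:

            from dulwich import porcelain
            recorded, resolved = porcelain.rerere(".")  # record current conflicts
            for conflict_id, has_resolution in porcelain.rerere_status("."):
                print(conflict_id, has_resolution)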
Args: args: Command line arguments """ parser = argparse.ArgumentParser() parser.add_argument("gitdir", nargs="?", default=".", help="Git directory") parser.add_argument( "subcommand", nargs="?", default=None, choices=["status", "diff", "forget", "clear", "gc"], help="Subcommand to execute (default: record conflicts)", ) parser.add_argument( "pathspec", nargs="?", help="Path specification (for forget subcommand)" ) parser.add_argument( "--max-age-days", type=int, default=60, help="Maximum age in days for gc (default: 60)", ) parsed_args = parser.parse_args(args) if parsed_args.subcommand is None: # Record current conflicts recorded, resolved = porcelain.rerere(parsed_args.gitdir) if not recorded: sys.stdout.write("No conflicts to record.\n") else: for path, conflict_id in recorded: sys.stdout.write( f"Recorded resolution for {path.decode('utf-8')}: {conflict_id}\n" ) if resolved: sys.stdout.write("\nAutomatically resolved:\n") for path in resolved: sys.stdout.write(f" {path.decode('utf-8')}\n") elif parsed_args.subcommand == "status": status_list = porcelain.rerere_status(parsed_args.gitdir) if not status_list: sys.stdout.write("No recorded resolutions.\n") else: for conflict_id, has_resolution in status_list: status = "resolved" if has_resolution else "unresolved" sys.stdout.write(f"{conflict_id}\t{status}\n") elif parsed_args.subcommand == "diff": diff_list = porcelain.rerere_diff(parsed_args.gitdir) if not diff_list: sys.stdout.write("No recorded conflicts.\n") else: for conflict_id, preimage, postimage in diff_list: sys.stdout.write(f"--- {conflict_id} (preimage)\n") sys.stdout.buffer.write(preimage) sys.stdout.write("\n") if postimage: sys.stdout.write(f"+++ {conflict_id} (postimage)\n") sys.stdout.buffer.write(postimage) sys.stdout.write("\n") elif parsed_args.subcommand == "forget": porcelain.rerere_forget(parsed_args.gitdir, parsed_args.pathspec) if parsed_args.pathspec: sys.stdout.write(f"Forgot resolution for {parsed_args.pathspec}\n") else: sys.stdout.write("Forgot all resolutions\n") elif parsed_args.subcommand == "clear": porcelain.rerere_clear(parsed_args.gitdir) sys.stdout.write("Cleared all rerere resolutions\n") elif parsed_args.subcommand == "gc": porcelain.rerere_gc(parsed_args.gitdir, parsed_args.max_age_days) sys.stdout.write( f"Cleaned up resolutions older than {parsed_args.max_age_days} days\n" ) commands = { "add": cmd_add, "annotate": cmd_annotate, "archive": cmd_archive, "bisect": cmd_bisect, "blame": cmd_blame, "branch": cmd_branch, "bundle": cmd_bundle, "check-ignore": cmd_check_ignore, "check-mailmap": cmd_check_mailmap, "checkout": cmd_checkout, "cherry": cmd_cherry, "cherry-pick": cmd_cherry_pick, "clone": cmd_clone, "column": cmd_column, "commit": cmd_commit, "commit-tree": cmd_commit_tree, "config": cmd_config, "count-objects": cmd_count_objects, "describe": cmd_describe, "diagnose": cmd_diagnose, "daemon": cmd_daemon, "diff": cmd_diff, "diff-tree": cmd_diff_tree, "dump-pack": cmd_dump_pack, "dump-index": cmd_dump_index, "fetch-pack": cmd_fetch_pack, "fetch": cmd_fetch, "filter-branch": cmd_filter_branch, "for-each-ref": cmd_for_each_ref, "format-patch": cmd_format_patch, "fsck": cmd_fsck, "gc": cmd_gc, "grep": cmd_grep, "help": cmd_help, "init": cmd_init, "interpret-trailers": cmd_interpret_trailers, "lfs": cmd_lfs, "log": cmd_log, "ls-files": cmd_ls_files, "ls-remote": cmd_ls_remote, "ls-tree": cmd_ls_tree, "maintenance": cmd_maintenance, "mailinfo": cmd_mailinfo, "mailsplit": cmd_mailsplit, "merge": cmd_merge, "merge-base": cmd_merge_base, "merge-tree": 
cmd_merge_tree, "notes": cmd_notes, "pack-objects": cmd_pack_objects, "pack-refs": cmd_pack_refs, "prune": cmd_prune, "pull": cmd_pull, "push": cmd_push, "rebase": cmd_rebase, "receive-pack": cmd_receive_pack, "reflog": cmd_reflog, "rerere": cmd_rerere, "remote": cmd_remote, "repack": cmd_repack, "replace": cmd_replace, "reset": cmd_reset, "restore": cmd_restore, "revert": cmd_revert, "rev-list": cmd_rev_list, "rm": cmd_rm, "mv": cmd_mv, "show": cmd_show, "show-branch": cmd_show_branch, "show-ref": cmd_show_ref, "stash": cmd_stash, "status": cmd_status, "stripspace": cmd_stripspace, "shortlog": cmd_shortlog, "switch": cmd_switch, "symbolic-ref": cmd_symbolic_ref, "submodule": cmd_submodule, "tag": cmd_tag, "unpack-objects": cmd_unpack_objects, "update-server-info": cmd_update_server_info, "upload-pack": cmd_upload_pack, "var": cmd_var, "verify-commit": cmd_verify_commit, "verify-tag": cmd_verify_tag, "web-daemon": cmd_web_daemon, "worktree": cmd_worktree, "write-tree": cmd_write_tree, } def main(argv: Sequence[str] | None = None) -> int | None: """Main entry point for the Dulwich CLI. Args: argv: Command line arguments (defaults to sys.argv[1:]) Returns: Exit code or None """ # Wrap stdout and stderr to respect GIT_FLUSH environment variable sys.stdout = AutoFlushTextIOWrapper.env(sys.stdout) sys.stderr = AutoFlushTextIOWrapper.env(sys.stderr) if argv is None: argv = sys.argv[1:] # Parse only the global options and command, stop at first positional parser = argparse.ArgumentParser( prog="dulwich", description="Simple command-line interface to Dulwich", add_help=False, # We'll handle help ourselves ) parser.add_argument("--no-pager", action="store_true", help="Disable pager") parser.add_argument("--pager", action="store_true", help="Force enable pager") parser.add_argument("--help", "-h", action="store_true", help="Show help") # Parse known args to separate global options from command args global_args, remaining = parser.parse_known_args(argv) # Apply global pager settings if global_args.no_pager: disable_pager() elif global_args.pager: enable_pager() # Handle help if global_args.help or not remaining: parser = argparse.ArgumentParser( prog="dulwich", description="Simple command-line interface to Dulwich" ) parser.add_argument("--no-pager", action="store_true", help="Disable pager") parser.add_argument("--pager", action="store_true", help="Force enable pager") parser.add_argument( "command", nargs="?", help=f"Command to run. 
Available: {', '.join(sorted(commands.keys()))}", ) parser.print_help() return 1 # Try to configure from GIT_TRACE, fall back to default if it fails if not _configure_logging_from_trace(): logging.basicConfig( level=logging.INFO, format="%(message)s", ) # First remaining arg is the command cmd = remaining[0] cmd_args = remaining[1:] try: cmd_kls = commands[cmd] except KeyError: logging.fatal("No such subcommand: %s", cmd) return 1 # TODO(jelmer): Return non-0 on errors return cmd_kls().run(cmd_args) def _main() -> None: if "DULWICH_PDB" in os.environ and getattr(signal, "SIGQUIT", None): signal.signal(signal.SIGQUIT, signal_quit) # type: ignore[attr-defined,unused-ignore] signal.signal(signal.SIGINT, signal_int) sys.exit(main()) if __name__ == "__main__": _main() dulwich-1.0.0/dulwich/client.py000066400000000000000000005366151513301442600164550ustar00rootroot00000000000000# client.py -- Implementation of the client side git protocols # Copyright (C) 2008-2013 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Client side support for the Git protocol. 
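To obtain a client for a given URL, ``get_transport_and_path`` (exported from
this module) can be used; a minimal sketch with an illustrative URL:

    from dulwich.client import get_transport_and_path
    client, path = get_transport_and_path("https://example.com/repo.git")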
The Dulwich client supports the following capabilities: * thin-pack * multi_ack_detailed * multi_ack * side-band-64k * ofs-delta * quiet * report-status * delete-refs * shallow Known capabilities that are not supported: * no-progress * include-tag """ __all__ = [ "COMMON_CAPABILITIES", "DEFAULT_GIT_CREDENTIALS_PATHS", "DEFAULT_REF_PREFIX", "RECEIVE_CAPABILITIES", "UPLOAD_CAPABILITIES", "AbstractHttpGitClient", "BundleClient", "FetchPackResult", "GitClient", "HTTPProxyUnauthorized", "HTTPUnauthorized", "InvalidWants", "LocalGitClient", "LsRemoteResult", "PLinkSSHVendor", "ReportStatusParser", "SSHGitClient", "SSHVendor", "SendPackResult", "StrangeHostname", "SubprocessGitClient", "SubprocessSSHVendor", "SubprocessWrapper", "TCPGitClient", "TraditionalGitClient", "Urllib3HttpGitClient", "check_for_proxy_bypass", "check_wants", "default_urllib3_manager", "default_user_agent_string", "find_capability", "find_git_command", "get_credentials_from_store", "get_transport_and_path", "get_transport_and_path_from_url", "negotiate_protocol_version", "parse_rsync_url", "read_pkt_refs_v1", "read_pkt_refs_v2", "read_server_capabilities", ] import copy import functools import logging import os import select import socket import subprocess import sys from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set from contextlib import closing from io import BufferedReader, BytesIO from typing import ( IO, TYPE_CHECKING, Any, ClassVar, ) from urllib.parse import ParseResult, urljoin, urlparse, urlunparse, urlunsplit from urllib.parse import quote as urlquote if TYPE_CHECKING: import urllib3 import dulwich if TYPE_CHECKING: from collections.abc import Mapping from typing import Protocol as TypingProtocol from .objects import ObjectID from .pack import UnpackedObject from .refs import Ref class HTTPResponse(TypingProtocol): """Protocol for HTTP response objects (matches urllib3.response.HTTPResponse).""" status: int headers: Mapping[str, str] content_type: str | None redirect_location: str def close(self) -> None: ... def read(self, amt: int | None = None) -> bytes: ... def geturl(self) -> str | None: ... class GeneratePackDataFunc(TypingProtocol): """Protocol for generate_pack_data functions.""" def __call__( self, have: Set[ObjectID], want: Set[ObjectID], *, ofs_delta: bool = False, progress: Callable[[bytes], None] | None = None, ) -> tuple[int, Iterator[UnpackedObject]]: """Generate pack data for the given have and want sets.""" ... class DetermineWantsFunc(TypingProtocol): """Protocol for determine_wants functions.""" def __call__( self, refs: Mapping[Ref, ObjectID], depth: int | None = None, ) -> list[ObjectID]: """Determine the objects to fetch from the given refs.""" ... 
from .bundle import Bundle from .config import Config, apply_instead_of, get_xdg_config_home_path from .credentials import match_partial_url, match_urls from .errors import GitProtocolError, HangupException, NotGitRepository, SendPackError from .object_format import DEFAULT_OBJECT_FORMAT from .object_store import GraphWalker from .objects import ObjectID from .pack import ( PACK_SPOOL_FILE_MAX_SIZE, PackChunkGenerator, PackData, write_pack_from_container, ) from .protocol import ( _RBUFSIZE, CAPABILITIES_REF, CAPABILITY_AGENT, CAPABILITY_DELETE_REFS, CAPABILITY_FETCH, CAPABILITY_FILTER, CAPABILITY_INCLUDE_TAG, CAPABILITY_MULTI_ACK, CAPABILITY_MULTI_ACK_DETAILED, CAPABILITY_OFS_DELTA, CAPABILITY_QUIET, CAPABILITY_REPORT_STATUS, CAPABILITY_SHALLOW, CAPABILITY_SIDE_BAND_64K, CAPABILITY_SYMREF, CAPABILITY_THIN_PACK, COMMAND_DEEPEN, COMMAND_DEEPEN_NOT, COMMAND_DEEPEN_SINCE, COMMAND_DONE, COMMAND_HAVE, COMMAND_SHALLOW, COMMAND_UNSHALLOW, COMMAND_WANT, DEFAULT_GIT_PROTOCOL_VERSION_FETCH, DEFAULT_GIT_PROTOCOL_VERSION_SEND, GIT_PROTOCOL_VERSIONS, KNOWN_RECEIVE_CAPABILITIES, KNOWN_UPLOAD_CAPABILITIES, PEELED_TAG_SUFFIX, SIDE_BAND_CHANNEL_DATA, SIDE_BAND_CHANNEL_FATAL, SIDE_BAND_CHANNEL_PROGRESS, TCP_GIT_PORT, ZERO_SHA, PktLineParser, Protocol, agent_string, capability_agent, extract_capabilities, extract_capability_names, parse_capability, pkt_line, pkt_seq, split_peeled_refs, ) from .refs import ( HEADREF, SYMREF, Ref, _import_remote_refs, _set_default_branch, _set_head, _set_origin_head, filter_ref_prefix, read_info_refs, ) from .repo import BaseRepo, Repo # Default ref prefix, used if none is specified. # GitHub defaults to just sending HEAD if no ref-prefix is # specified, so explicitly request all refs to match # behaviour with v1 when no ref-prefix is specified. DEFAULT_REF_PREFIX = [b"HEAD", b"refs/"] logger = logging.getLogger(__name__) class InvalidWants(Exception): """Invalid wants.""" def __init__(self, wants: Set[bytes]) -> None: """Initialize InvalidWants exception. Args: wants: List of invalid wants """ Exception.__init__( self, f"requested wants not in server provided refs: {wants!r}" ) class HTTPUnauthorized(Exception): """Raised when authentication fails.""" def __init__(self, www_authenticate: str | None, url: str) -> None: """Initialize HTTPUnauthorized exception. Args: www_authenticate: WWW-Authenticate header value url: URL that requires authentication """ Exception.__init__(self, "No valid credentials provided") self.www_authenticate = www_authenticate self.url = url def _to_optional_dict(refs: Mapping[Ref, ObjectID]) -> dict[Ref, ObjectID | None]: """Convert a dict[Ref, ObjectID] to dict[Ref, Optional[ObjectID]]. This is needed for compatibility with result types that expect Optional values. """ return {k: v for k, v in refs.items()} class HTTPProxyUnauthorized(Exception): """Raised when proxy authentication fails.""" def __init__(self, proxy_authenticate: str | None, url: str) -> None: """Initialize HTTPProxyUnauthorized exception. 
Args: proxy_authenticate: Proxy-Authenticate header value url: URL that requires proxy authentication """ Exception.__init__(self, "No valid proxy credentials provided") self.proxy_authenticate = proxy_authenticate self.url = url def _fileno_can_read(fileno: int) -> bool: """Check if a file descriptor is readable.""" return len(select.select([fileno], [], [], 0)[0]) > 0 def _win32_peek_avail(handle: int) -> int: """Wrapper around PeekNamedPipe to check how many bytes are available.""" from ctypes import ( # type: ignore[attr-defined,unused-ignore] byref, windll, wintypes, ) c_avail = wintypes.DWORD() c_message = wintypes.DWORD() success = windll.kernel32.PeekNamedPipe( handle, None, 0, None, byref(c_avail), byref(c_message) ) if not success: from ctypes import GetLastError # type: ignore[attr-defined,unused-ignore] raise OSError(GetLastError()) return c_avail.value COMMON_CAPABILITIES = [CAPABILITY_OFS_DELTA, CAPABILITY_SIDE_BAND_64K] UPLOAD_CAPABILITIES = [ CAPABILITY_THIN_PACK, CAPABILITY_MULTI_ACK, CAPABILITY_MULTI_ACK_DETAILED, CAPABILITY_SHALLOW, *COMMON_CAPABILITIES, ] RECEIVE_CAPABILITIES = [ CAPABILITY_REPORT_STATUS, CAPABILITY_DELETE_REFS, *COMMON_CAPABILITIES, ] class ReportStatusParser: """Handle status as reported by servers with 'report-status' capability.""" def __init__(self) -> None: """Initialize ReportStatusParser.""" self._done = False self._pack_status: bytes | None = None self._ref_statuses: list[bytes] = [] def check(self) -> Iterator[tuple[bytes, str | None]]: """Check if there were any errors and, if so, raise exceptions. Raises: SendPackError: Raised when the server could not unpack Returns: iterator over refs """ if self._pack_status not in (b"unpack ok", None): raise SendPackError(self._pack_status) for status in self._ref_statuses: try: status, rest = status.split(b" ", 1) except ValueError: # malformed response, move on to the next one continue if status == b"ng": ref, error = rest.split(b" ", 1) yield ref, error.decode("utf-8") elif status == b"ok": yield rest, None else: raise GitProtocolError(f"invalid ref status {status!r}") def handle_packet(self, pkt: bytes | None) -> None: """Handle a packet. Raises: GitProtocolError: Raised when packets are received after a flush packet. """ if self._done: raise GitProtocolError("received more data after status report") if pkt is None: self._done = True return if self._pack_status is None: self._pack_status = pkt.strip() else: ref_status = pkt.strip() self._ref_statuses.append(ref_status) def negotiate_protocol_version(proto: Protocol) -> int: """Negotiate protocol version with the server.""" pkt = proto.read_pkt_line() if pkt is not None and pkt.strip() == b"version 2": return 2 proto.unread_pkt_line(pkt) return 0 def read_server_capabilities(pkt_seq: Iterable[bytes]) -> set[bytes]: """Read server capabilities from packet sequence.""" server_capabilities = [] for pkt in pkt_seq: server_capabilities.append(pkt) return set(server_capabilities) def extract_object_format_from_capabilities( capabilities: set[bytes], ) -> str | None: """Extract object format from server capabilities. 
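    A small illustration of the expected behaviour:

        extract_object_format_from_capabilities({b"object-format=sha256"})
        # -> "sha256"; returns None when no object-format capability is present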
Args: capabilities: Server capabilities Returns: Object format name as string (e.g., "sha1", "sha256"), or None if not specified """ for capability in capabilities: k, v = parse_capability(capability) if k == b"object-format" and v is not None: return v.decode("ascii").strip() return None def read_pkt_refs_v2( pkt_seq: Iterable[bytes], ) -> tuple[dict[Ref, ObjectID | None], dict[Ref, Ref], dict[Ref, ObjectID]]: """Read references using protocol version 2.""" refs: dict[Ref, ObjectID | None] = {} symrefs: dict[Ref, Ref] = {} peeled: dict[Ref, ObjectID] = {} # Receive refs from server for pkt in pkt_seq: parts = pkt.rstrip(b"\n").split(b" ") sha_bytes = parts[0] sha: ObjectID | None if sha_bytes == b"unborn": sha = None else: sha = ObjectID(sha_bytes) ref = Ref(parts[1]) for part in parts[2:]: if part.startswith(b"peeled:"): peeled[ref] = ObjectID(part[7:]) elif part.startswith(b"symref-target:"): symrefs[ref] = Ref(part[14:]) else: logging.warning("unknown part in pkt-ref: %s", part) refs[ref] = sha return refs, symrefs, peeled def read_pkt_refs_v1( pkt_seq: Iterable[bytes], ) -> tuple[dict[Ref, ObjectID], set[bytes]]: """Read references using protocol version 1.""" server_capabilities = None refs: dict[Ref, ObjectID] = {} # Receive refs from server for pkt in pkt_seq: (sha, ref) = pkt.rstrip(b"\n").split(None, 1) if sha == b"ERR": raise GitProtocolError(ref.decode("utf-8", "replace")) if server_capabilities is None: (ref, server_capabilities) = extract_capabilities(ref) refs[Ref(ref)] = ObjectID(sha) if len(refs) == 0: return {}, set() if refs == {CAPABILITIES_REF: ZERO_SHA}: refs = {} assert server_capabilities is not None return refs, set(server_capabilities) class _DeprecatedDictProxy: """Base class for result objects that provide deprecated dict-like interface.""" refs: dict[Ref, ObjectID | None] # To be overridden by subclasses _FORWARDED_ATTRS: ClassVar[set[str]] = { "clear", "copy", "fromkeys", "get", "items", "keys", "pop", "popitem", "setdefault", "update", "values", "viewitems", "viewkeys", "viewvalues", } def _warn_deprecated(self) -> None: import warnings warnings.warn( f"Use {self.__class__.__name__}.refs instead.", DeprecationWarning, stacklevel=3, ) def __contains__(self, name: Ref) -> bool: self._warn_deprecated() return name in self.refs def __getitem__(self, name: Ref) -> ObjectID | None: self._warn_deprecated() return self.refs[name] def __len__(self) -> int: self._warn_deprecated() return len(self.refs) def __iter__(self) -> Iterator[Ref]: self._warn_deprecated() return iter(self.refs) def __getattribute__(self, name: str) -> object: # Avoid infinite recursion by checking against class variable directly if name != "_FORWARDED_ATTRS" and name in type(self)._FORWARDED_ATTRS: self._warn_deprecated() # Direct attribute access to avoid recursion refs = object.__getattribute__(self, "refs") return getattr(refs, name) return super().__getattribute__(name) class FetchPackResult(_DeprecatedDictProxy): """Result of a fetch-pack operation. 
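    Besides the refs advertised by the remote, this records symrefs, the
    remote agent string, any new shallow/unshallow boundaries and the remote's
    object format. The dict-like access inherited from ``_DeprecatedDictProxy``
    proxies to ``refs`` and emits a ``DeprecationWarning``.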
Attributes: refs: Dictionary with all remote refs symrefs: Dictionary with remote symrefs agent: User agent string object_format: Object format name (e.g., "sha1", "sha256") used by the remote, or None if not specified """ refs: dict[Ref, ObjectID | None] symrefs: dict[Ref, Ref] agent: bytes | None object_format: str | None def __init__( self, refs: dict[Ref, ObjectID | None], symrefs: dict[Ref, Ref], agent: bytes | None, new_shallow: set[ObjectID] | None = None, new_unshallow: set[ObjectID] | None = None, object_format: str | None = None, ) -> None: """Initialize FetchPackResult. Args: refs: Dictionary with all remote refs symrefs: Dictionary with remote symrefs agent: User agent string new_shallow: New shallow commits new_unshallow: New unshallow commits object_format: Object format name (e.g., "sha1", "sha256") used by the remote """ self.refs = refs self.symrefs = symrefs self.agent = agent self.new_shallow = new_shallow self.new_unshallow = new_unshallow self.object_format = object_format def __eq__(self, other: object) -> bool: """Check equality with another object.""" if isinstance(other, dict): self._warn_deprecated() return self.refs == other if not isinstance(other, FetchPackResult): return False return ( self.refs == other.refs and self.symrefs == other.symrefs and self.agent == other.agent ) def __repr__(self) -> str: """Return string representation of FetchPackResult.""" return f"{self.__class__.__name__}({self.refs!r}, {self.symrefs!r}, {self.agent!r})" class LsRemoteResult(_DeprecatedDictProxy): """Result of a ls-remote operation. Attributes: refs: Dictionary with all remote refs symrefs: Dictionary with remote symrefs object_format: Object format name (e.g., "sha1", "sha256") used by the remote, or None if not specified """ symrefs: dict[Ref, Ref] object_format: str | None def __init__( self, refs: dict[Ref, ObjectID | None], symrefs: dict[Ref, Ref], object_format: str | None = None, ) -> None: """Initialize LsRemoteResult. Args: refs: Dictionary with all remote refs symrefs: Dictionary with remote symrefs object_format: Object format name (e.g., "sha1", "sha256") used by the remote """ self.refs = refs self.symrefs = symrefs self.object_format = object_format def _warn_deprecated(self) -> None: import warnings warnings.warn( "Treating LsRemoteResult as a dictionary is deprecated. " "Use result.refs instead.", DeprecationWarning, stacklevel=3, ) def __eq__(self, other: object) -> bool: """Check equality with another object.""" if isinstance(other, dict): self._warn_deprecated() return self.refs == other if not isinstance(other, LsRemoteResult): return False return self.refs == other.refs and self.symrefs == other.symrefs def __repr__(self) -> str: """Return string representation of LsRemoteResult.""" return f"{self.__class__.__name__}({self.refs!r}, {self.symrefs!r})" class SendPackResult(_DeprecatedDictProxy): """Result of a upload-pack operation. Attributes: refs: Dictionary with all remote refs agent: User agent string ref_status: Optional dictionary mapping ref name to error message (if it failed to update), or None if it was updated successfully """ def __init__( self, refs: dict[Ref, ObjectID | None], agent: bytes | None = None, ref_status: dict[bytes, str | None] | None = None, ) -> None: """Initialize SendPackResult. 
Args: refs: Dictionary with all remote refs agent: User agent string ref_status: Optional dictionary mapping ref name to error message """ self.refs = refs self.agent = agent self.ref_status = ref_status def __eq__(self, other: object) -> bool: """Check equality with another object.""" if isinstance(other, dict): self._warn_deprecated() return self.refs == other if not isinstance(other, SendPackResult): return False return self.refs == other.refs and self.agent == other.agent def __repr__(self) -> str: """Return string representation of SendPackResult.""" return f"{self.__class__.__name__}({self.refs!r}, {self.agent!r})" def _read_shallow_updates( pkt_seq: Iterable[bytes], ) -> tuple[set[ObjectID], set[ObjectID]]: new_shallow: set[ObjectID] = set() new_unshallow: set[ObjectID] = set() for pkt in pkt_seq: if pkt == b"shallow-info\n": # Git-protocol v2 continue try: cmd, sha = pkt.split(b" ", 1) except ValueError: raise GitProtocolError(f"unknown command {pkt!r}") if cmd == COMMAND_SHALLOW: new_shallow.add(ObjectID(sha.strip())) elif cmd == COMMAND_UNSHALLOW: new_unshallow.add(ObjectID(sha.strip())) else: raise GitProtocolError(f"unknown command {pkt!r}") return (new_shallow, new_unshallow) class _v1ReceivePackHeader: def __init__( self, capabilities: Sequence[bytes], old_refs: Mapping[Ref, ObjectID], new_refs: Mapping[Ref, ObjectID], ) -> None: self.want: set[ObjectID] = set() self.have: set[ObjectID] = set() self._it = self._handle_receive_pack_head(capabilities, old_refs, new_refs) self.sent_capabilities = False def __iter__(self) -> Iterator[bytes | None]: return self._it def _handle_receive_pack_head( self, capabilities: Sequence[bytes], old_refs: Mapping[Ref, ObjectID], new_refs: Mapping[Ref, ObjectID], ) -> Iterator[bytes | None]: """Handle the head of a 'git-receive-pack' request. Args: capabilities: List of negotiated capabilities old_refs: Old refs, as received from the server new_refs: Refs to change Returns: (have, want) tuple """ self.have = {x for x in old_refs.values() if not x == ZERO_SHA} for refname in new_refs: if not isinstance(refname, bytes): raise TypeError(f"refname is not a bytestring: {refname!r}") old_sha1 = old_refs.get(refname, ZERO_SHA) if not isinstance(old_sha1, bytes): raise TypeError( f"old sha1 for {refname!r} is not a bytestring: {old_sha1!r}" ) new_sha1 = new_refs.get(refname, ZERO_SHA) if not isinstance(new_sha1, bytes): raise TypeError( f"old sha1 for {refname!r} is not a bytestring {new_sha1!r}" ) if old_sha1 != new_sha1: logger.debug( "Sending updated ref %r: %r -> %r", refname, old_sha1, new_sha1 ) if self.sent_capabilities: yield old_sha1 + b" " + new_sha1 + b" " + refname else: yield ( old_sha1 + b" " + new_sha1 + b" " + refname + b"\0" + b" ".join(sorted(capabilities)) ) self.sent_capabilities = True if new_sha1 not in self.have and new_sha1 != ZERO_SHA: self.want.add(new_sha1) yield None def _read_side_band64k_data(pkt_seq: Iterable[bytes]) -> Iterator[tuple[int, bytes]]: """Read per-channel data. This requires the side-band-64k capability. 
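    Each pkt-line carries its channel number in the first byte
    (``SIDE_BAND_CHANNEL_DATA``, ``SIDE_BAND_CHANNEL_PROGRESS`` or
    ``SIDE_BAND_CHANNEL_FATAL``); the remainder of the packet is yielded as
    the payload for that channel.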
Args: pkt_seq: Sequence of packets to read """ for pkt in pkt_seq: channel = ord(pkt[:1]) yield channel, pkt[1:] def find_capability( capabilities: Iterable[bytes], key: bytes, value: bytes | None ) -> bytes | None: """Find a capability with a specific key and value.""" for capability in capabilities: k, v = parse_capability(capability) if k != key: continue if value and v and value not in v.split(b" "): continue return capability return None def _handle_upload_pack_head( proto: Protocol, capabilities: Iterable[bytes], graph_walker: GraphWalker, wants: list[ObjectID], can_read: Callable[[], bool] | None, depth: int | None, protocol_version: int | None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> tuple[set[ObjectID] | None, set[ObjectID] | None]: """Handle the head of a 'git-upload-pack' request. Args: proto: Protocol object to read from capabilities: List of negotiated capabilities graph_walker: GraphWalker instance to call .ack() on wants: List of commits to fetch can_read: function that returns a boolean that indicates whether there is extra graph data to read on proto depth: Depth for request protocol_version: Neogiated Git protocol version. shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs """ new_shallow: set[ObjectID] | None new_unshallow: set[ObjectID] | None assert isinstance(wants, list) and isinstance(wants[0], bytes) wantcmd = COMMAND_WANT + b" " + wants[0] if protocol_version is None: protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_SEND if protocol_version != 2: wantcmd += b" " + b" ".join(sorted(capabilities)) wantcmd += b"\n" proto.write_pkt_line(wantcmd) for want in wants[1:]: proto.write_pkt_line(COMMAND_WANT + b" " + want + b"\n") if ( depth not in (0, None) or shallow_since is not None or shallow_exclude or (hasattr(graph_walker, "shallow") and graph_walker.shallow) ): if protocol_version == 2: if not find_capability(capabilities, CAPABILITY_FETCH, CAPABILITY_SHALLOW): raise GitProtocolError( "server does not support shallow capability required for depth" ) elif CAPABILITY_SHALLOW not in capabilities: raise GitProtocolError( "server does not support shallow capability required for depth" ) if hasattr(graph_walker, "shallow"): for sha in graph_walker.shallow: proto.write_pkt_line(COMMAND_SHALLOW + b" " + sha + b"\n") if depth is not None: proto.write_pkt_line( COMMAND_DEEPEN + b" " + str(depth).encode("ascii") + b"\n" ) if shallow_since is not None: proto.write_pkt_line( COMMAND_DEEPEN_SINCE + b" " + shallow_since.encode("ascii") + b"\n" ) if shallow_exclude: for ref in shallow_exclude: proto.write_pkt_line( COMMAND_DEEPEN_NOT + b" " + ref.encode("ascii") + b"\n" ) if protocol_version != 2: proto.write_pkt_line(None) have = next(graph_walker) while have: proto.write_pkt_line(COMMAND_HAVE + b" " + have + b"\n") if can_read is not None and can_read(): pkt = proto.read_pkt_line() assert pkt is not None parts = pkt.rstrip(b"\n").split(b" ") if parts[0] == b"ACK": graph_walker.ack(ObjectID(parts[1])) if parts[2] in (b"continue", b"common"): pass elif parts[2] == b"ready": break else: raise AssertionError( f"{parts[2]!r} not in ('continue', 'ready', 'common)" ) have = next(graph_walker) proto.write_pkt_line(COMMAND_DONE + b"\n") if protocol_version == 2: proto.write_pkt_line(None) if depth not in (0, None) or shallow_since is not None or shallow_exclude: if can_read is not None: (new_shallow, new_unshallow) = 
_read_shallow_updates(proto.read_pkt_seq()) else: new_shallow = None new_unshallow = None else: new_shallow = new_unshallow = set[ObjectID]() return (new_shallow, new_unshallow) def _handle_upload_pack_tail( proto: "Protocol", capabilities: Set[bytes], graph_walker: "GraphWalker", pack_data: Callable[[bytes], int], progress: Callable[[bytes], None] | None = None, rbufsize: int = _RBUFSIZE, protocol_version: int = 0, ) -> None: """Handle the tail of a 'git-upload-pack' request. Args: proto: Protocol object to read from capabilities: List of negotiated capabilities graph_walker: GraphWalker instance to call .ack() on pack_data: Function to call with pack data progress: Optional progress reporting function rbufsize: Read buffer size protocol_version: Neogiated Git protocol version. """ pkt = proto.read_pkt_line() while pkt: parts = pkt.rstrip(b"\n").split(b" ") if protocol_version == 2 and parts[0] != b"packfile": break else: if parts[0] == b"ACK": graph_walker.ack(ObjectID(parts[1])) if parts[0] == b"NAK": graph_walker.nak() if len(parts) < 3 or parts[2] not in ( b"ready", b"continue", b"common", ): break pkt = proto.read_pkt_line() if CAPABILITY_SIDE_BAND_64K in capabilities or protocol_version == 2: if progress is None: # Just ignore progress data def progress(x: bytes) -> None: pass for chan, data in _read_side_band64k_data(proto.read_pkt_seq()): if chan == SIDE_BAND_CHANNEL_DATA: pack_data(data) elif chan == SIDE_BAND_CHANNEL_PROGRESS: progress(data) else: raise AssertionError(f"Invalid sideband channel {chan}") else: while True: data = proto.read(rbufsize) if data == b"": break pack_data(data) def _extract_symrefs_and_agent( capabilities: Iterable[bytes], ) -> tuple[dict[Ref, Ref], bytes | None]: """Extract symrefs and agent from capabilities. Args: capabilities: List of capabilities Returns: (symrefs, agent) tuple """ symrefs: dict[Ref, Ref] = {} agent = None for capability in capabilities: k, v = parse_capability(capability) if k == CAPABILITY_SYMREF: assert v is not None (src, dst) = v.split(b":", 1) symrefs[Ref(src)] = Ref(dst) if k == CAPABILITY_AGENT: agent = v return (symrefs, agent) # TODO(durin42): this doesn't correctly degrade if the server doesn't # support some capabilities. This should work properly with servers # that don't support multi_ack. class GitClient: """Git smart server client.""" def __init__( self, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Create a new GitClient instance. Args: thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity. quiet: Whether to suppress output include_tags: send annotated tags when sending the objects they point to """ self._report_activity = report_activity self._report_status_parser: ReportStatusParser | None = None self._fetch_capabilities = set(UPLOAD_CAPABILITIES) self._fetch_capabilities.add(capability_agent()) self._send_capabilities = set(RECEIVE_CAPABILITIES) self._send_capabilities.add(capability_agent()) if quiet: self._send_capabilities.add(CAPABILITY_QUIET) if not thin_packs: self._fetch_capabilities.remove(CAPABILITY_THIN_PACK) if include_tags: self._fetch_capabilities.add(CAPABILITY_INCLUDE_TAG) self.protocol_version = 0 # will be overridden later def close(self) -> None: """Close the client and release any resources. Default implementation does nothing as most clients don't maintain persistent connections. 
Subclasses that hold resources should override this method to properly clean them up. """ def get_url(self, path: str) -> str: """Retrieves full url to given path. Args: path: Repository path (as string) Returns: Url to path (as string) """ raise NotImplementedError(self.get_url) @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, ) -> "GitClient": """Create an instance of this client from a urlparse.parsed object. Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb HTTP transport (only for HTTP) username: Optional username for authentication (only for HTTP) password: Optional password for authentication (only for HTTP) config: Optional configuration object Returns: A `GitClient` object """ raise NotImplementedError(cls.from_parsedurl) def send_pack( self, path: bytes, update_refs: Callable[[dict[Ref, ObjectID]], dict[Ref, ObjectID]], generate_pack_data: "GeneratePackDataFunc", progress: Callable[[bytes], None] | None = None, ) -> SendPackResult: """Upload a pack to a remote repository. Args: path: Repository path (as bytestring) update_refs: Function to determine changes to remote refs. Receive dict with existing remote refs, returns dict with changed refs (name -> sha, where sha=ZERO_SHA for deletions) generate_pack_data: Function that can return a tuple with number of objects and list of pack data to include progress: Optional progress function Returns: SendPackResult object Raises: SendPackError: if server rejects the pack data """ raise NotImplementedError(self.send_pack) def clone( self, path: str, target_path: str, mkdir: bool = True, bare: bool = False, origin: str | None = "origin", checkout: bool | None = None, branch: str | None = None, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, ) -> Repo: """Clone a repository.""" if mkdir: os.mkdir(target_path) try: # For network clones, create repository with default SHA-1 format initially. # If remote uses a different format, fetch() will auto-change the repo's format # (since repo is empty at this point). # Subclasses (e.g., LocalGitClient) override to detect format first for efficiency. target = None if not bare: target = Repo.init(target_path) if checkout is None: checkout = True else: if checkout: raise ValueError("checkout and bare are incompatible") target = Repo.init_bare(target_path) # TODO(jelmer): abstract method for get_location? 
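        # Local and subprocess transports record the plain filesystem path as
        # the remote URL; all other transports record the full URL.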
if isinstance(self, (LocalGitClient, SubprocessGitClient)): encoded_path = path.encode("utf-8") else: encoded_path = self.get_url(path).encode("utf-8") assert target is not None if origin is not None: target_config = target.get_config() target_config.set( (b"remote", origin.encode("utf-8")), b"url", encoded_path ) target_config.set( (b"remote", origin.encode("utf-8")), b"fetch", b"+refs/heads/*:refs/remotes/" + origin.encode("utf-8") + b"/*", ) target_config.write_to_path() ref_message = b"clone: from " + encoded_path result = self.fetch( path.encode("utf-8"), target, progress=progress, depth=depth, ref_prefix=ref_prefix, filter_spec=filter_spec, protocol_version=protocol_version, ) if origin is not None: _import_remote_refs( target.refs, origin, result.refs, message=ref_message ) origin_head = result.symrefs.get(HEADREF) origin_sha = result.refs.get(HEADREF) if origin is None or (origin_sha and not origin_head): # set detached HEAD if origin_sha is not None: target.refs[HEADREF] = origin_sha head = origin_sha else: head = None else: _set_origin_head(target.refs, origin.encode("utf-8"), origin_head) # If origin_head is None (missing HEAD), fall back to configured default branch default_branch: bytes | None = None if origin_head is None: target_config = target.get_config() try: default_branch_name = target_config.get( (b"init",), b"defaultBranch" ) except KeyError: # Git's default is "master" default_branch_name = b"master" default_ref = Ref(b"refs/remotes/origin/" + default_branch_name) if default_ref in target.refs: default_branch = default_branch_name head_ref = _set_default_branch( target.refs, origin.encode("utf-8"), origin_head, (branch.encode("utf-8") if branch is not None else default_branch), ref_message, ) # Update target head if head_ref: head = _set_head(target.refs, head_ref, ref_message) else: head = None if checkout and head is not None: target.get_worktree().reset_index() except BaseException: if target is not None: target.close() if mkdir: import shutil shutil.rmtree(target_path) raise return target def fetch( self, path: bytes | str, target: BaseRepo, determine_wants: "DetermineWantsFunc | None" = None, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Fetch into a target repository. Args: path: Path to fetch from (as bytestring) target: Target repository to fetch into determine_wants: Optional function to determine what refs to fetch. Receives dictionary of name->sha, should return list of shas to fetch. Defaults to all shas. progress: Optional progress function depth: Depth to fetch at ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. 
shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: Dictionary with all remote refs (not just those fetched) """ if determine_wants is None: determine_wants = target.object_store.determine_wants_all if CAPABILITY_THIN_PACK in self._fetch_capabilities: from tempfile import SpooledTemporaryFile f: IO[bytes] = SpooledTemporaryFile( max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-", dir=getattr(target.object_store, "path", None), ) def commit() -> None: if f.tell(): f.seek(0) target.object_store.add_thin_pack(f.read, None, progress=progress) # type: ignore f.close() def abort() -> None: f.close() else: f, commit, abort = target.object_store.add_pack() try: result = self.fetch_pack( path, determine_wants, target.get_graph_walker(), f.write, progress=progress, depth=depth, ref_prefix=ref_prefix, filter_spec=filter_spec, protocol_version=protocol_version, shallow_since=shallow_since, shallow_exclude=shallow_exclude, ) # Fix object format if needed if ( result.object_format and result.object_format != target.object_format.name ): # Change the target repo's format if it's empty target._change_object_format(result.object_format) except BaseException: abort() raise else: commit() target.update_shallow(result.new_shallow, result.new_unshallow) return result def fetch_pack( self, path: bytes | str, determine_wants: "DetermineWantsFunc", graph_walker: GraphWalker, pack_data: Callable[[bytes], int], *, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Retrieve a pack from a git smart server. Args: path: Remote path to fetch from determine_wants: Function determine what refs to fetch. Receives dictionary of name->sha, should return list of shas to fetch. graph_walker: Object with next() and ack(). pack_data: Callback called for each bit of data in the pack progress: Callback for progress reports (strings) depth: Shallow fetch depth ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: FetchPackResult object """ raise NotImplementedError(self.fetch_pack) def get_refs( self, path: bytes, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> LsRemoteResult: """Retrieve the current refs from a git smart server. Args: path: Path to the repo to fetch from. (as bytestring) protocol_version: Desired Git protocol version. ref_prefix: Prefix filter for refs. Returns: LsRemoteResult object with refs and symrefs """ raise NotImplementedError(self.get_refs) @staticmethod def _should_send_pack(new_refs: Mapping[Ref, ObjectID]) -> bool: # The packfile MUST NOT be sent if the only command used is delete. 
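        # A pack is only needed when at least one ref is being updated to a
        # non-zero SHA.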
return any(sha != ZERO_SHA for sha in new_refs.values()) def _negotiate_receive_pack_capabilities( self, server_capabilities: set[bytes] ) -> tuple[set[bytes], bytes | None]: negotiated_capabilities = self._send_capabilities & server_capabilities (_symrefs, agent) = _extract_symrefs_and_agent(server_capabilities) (extract_capability_names(server_capabilities) - KNOWN_RECEIVE_CAPABILITIES) # TODO(jelmer): warn about unknown capabilities return (negotiated_capabilities, agent) def _handle_receive_pack_tail( self, proto: Protocol, capabilities: Set[bytes], progress: Callable[[bytes], None] | None = None, ) -> dict[bytes, str | None] | None: """Handle the tail of a 'git-receive-pack' request. Args: proto: Protocol object to read from capabilities: List of negotiated capabilities progress: Optional progress reporting function Returns: dict mapping ref name to: error message if the ref failed to update None if it was updated successfully """ if CAPABILITY_SIDE_BAND_64K in capabilities or self.protocol_version == 2: if progress is None: def progress(x: bytes) -> None: pass if CAPABILITY_REPORT_STATUS in capabilities: assert self._report_status_parser is not None pktline_parser = PktLineParser(self._report_status_parser.handle_packet) for chan, data in _read_side_band64k_data(proto.read_pkt_seq()): if chan == SIDE_BAND_CHANNEL_DATA: if CAPABILITY_REPORT_STATUS in capabilities: pktline_parser.parse(data) elif chan == SIDE_BAND_CHANNEL_PROGRESS: progress(data) else: raise AssertionError(f"Invalid sideband channel {chan}") else: if CAPABILITY_REPORT_STATUS in capabilities: assert self._report_status_parser for pkt in proto.read_pkt_seq(): self._report_status_parser.handle_packet(pkt) if self._report_status_parser is not None: return dict(self._report_status_parser.check()) return None def _negotiate_upload_pack_capabilities( self, server_capabilities: set[bytes] ) -> tuple[set[bytes], dict[Ref, Ref], bytes | None]: (extract_capability_names(server_capabilities) - KNOWN_UPLOAD_CAPABILITIES) # TODO(jelmer): warn about unknown capabilities fetch_capa = None for capability in server_capabilities: k, v = parse_capability(capability) if self.protocol_version == 2 and k == CAPABILITY_FETCH: fetch_capa = CAPABILITY_FETCH fetch_features = [] assert v is not None v_list = v.strip().split(b" ") if b"shallow" in v_list: fetch_features.append(CAPABILITY_SHALLOW) if b"filter" in v_list: fetch_features.append(CAPABILITY_FILTER) for i in range(len(fetch_features)): if i == 0: fetch_capa += b"=" else: fetch_capa += b" " fetch_capa += fetch_features[i] (symrefs, agent) = _extract_symrefs_and_agent(server_capabilities) negotiated_capabilities = self._fetch_capabilities & server_capabilities if fetch_capa: negotiated_capabilities.add(fetch_capa) return (negotiated_capabilities, symrefs, agent) def archive( self, path: bytes, committish: bytes, write_data: Callable[[bytes], None], progress: Callable[[bytes], None] | None = None, write_error: Callable[[bytes], None] | None = None, format: bytes | None = None, subdirs: Sequence[bytes] | None = None, prefix: bytes | None = None, ) -> None: """Retrieve an archive of the specified tree.""" raise NotImplementedError(self.archive) @staticmethod def _warn_filter_objects() -> None: logging.warning("object filtering not recognized by server, ignoring") def check_wants(wants: Set[bytes], refs: Mapping[bytes, bytes]) -> None: """Check that a set of wants is valid. 
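    Only the values of non-peeled refs (those without the
    ``PEELED_TAG_SUFFIX``) count as valid targets; ``InvalidWants`` is raised
    if any requested SHA is missing from that set.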
Args: wants: Set of object SHAs to fetch refs: Refs dictionary to check against """ missing = set(wants) - { v for (k, v) in refs.items() if not k.endswith(PEELED_TAG_SUFFIX) } if missing: raise InvalidWants(missing) def _remote_error_from_stderr(stderr: IO[bytes] | None) -> Exception: if stderr is None: return HangupException() lines = [line.rstrip(b"\n") for line in stderr.readlines()] for line in lines: if line.startswith(b"ERROR: "): return GitProtocolError(line[len(b"ERROR: ") :].decode("utf-8", "replace")) return HangupException(lines) class TraditionalGitClient(GitClient): """Traditional Git client.""" DEFAULT_ENCODING = "utf-8" def __init__( self, path_encoding: str = DEFAULT_ENCODING, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Initialize a TraditionalGitClient. Args: path_encoding: Encoding for paths (default: utf-8) thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags """ self._remote_path_encoding = path_encoding super().__init__( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) def _connect( self, cmd: bytes, path: str | bytes, protocol_version: int | None = None, ) -> tuple[Protocol, Callable[[], bool], IO[bytes] | None]: """Create a connection to the server. This method is abstract - concrete implementations should implement their own variant which connects to the server and returns an initialized Protocol object with the service ready for use and a can_read function which may be used to see if reads would block. Args: cmd: The git service name to which we should connect. path: The path we should pass to the service. (as bytestirng) protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. """ raise NotImplementedError def send_pack( self, path: bytes, update_refs: Callable[[dict[Ref, ObjectID]], dict[Ref, ObjectID]], generate_pack_data: "GeneratePackDataFunc", progress: Callable[[bytes], None] | None = None, ) -> SendPackResult: """Upload a pack to a remote repository. Args: path: Repository path (as bytestring) update_refs: Function to determine changes to remote refs. Receive dict with existing remote refs, returns dict with changed refs (name -> sha, where sha=ZERO_SHA for deletions) generate_pack_data: Function that can return a tuple with number of objects and pack data to upload. 
progress: Optional callback called with progress updates Returns: SendPackResult Raises: SendPackError: if server rejects the pack data """ self.protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_SEND proto, _unused_can_read, stderr = self._connect(b"receive-pack", path) with proto: try: old_refs, server_capabilities = read_pkt_refs_v1(proto.read_pkt_seq()) except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc ( negotiated_capabilities, agent, ) = self._negotiate_receive_pack_capabilities(server_capabilities) if CAPABILITY_REPORT_STATUS in negotiated_capabilities: self._report_status_parser = ReportStatusParser() report_status_parser = self._report_status_parser try: new_refs = orig_new_refs = update_refs(old_refs) except BaseException: proto.write_pkt_line(None) raise if set(new_refs.items()).issubset(set(old_refs.items())): proto.write_pkt_line(None) # Convert new_refs to match SendPackResult expected type return SendPackResult( _to_optional_dict(new_refs), agent=agent, ref_status={} ) if CAPABILITY_DELETE_REFS not in server_capabilities: # Server does not support deletions. Fail later. new_refs = dict(orig_new_refs) for ref, sha in orig_new_refs.items(): if sha == ZERO_SHA: if CAPABILITY_REPORT_STATUS in negotiated_capabilities: assert report_status_parser is not None report_status_parser._ref_statuses.append( b"ng " + ref + b" remote does not support deleting refs" ) del new_refs[ref] if new_refs is None: proto.write_pkt_line(None) return SendPackResult(old_refs, agent=agent, ref_status={}) if len(new_refs) == 0 and orig_new_refs: # NOOP - Original new refs filtered out by policy proto.write_pkt_line(None) if report_status_parser is not None: ref_status = dict(report_status_parser.check()) else: ref_status = None # Convert to Optional type for SendPackResult return SendPackResult( _to_optional_dict(old_refs), agent=agent, ref_status=ref_status ) header_handler = _v1ReceivePackHeader( list(negotiated_capabilities), old_refs, new_refs, ) for pkt in header_handler: proto.write_pkt_line(pkt) pack_data_count, pack_data = generate_pack_data( header_handler.have, header_handler.want, ofs_delta=(CAPABILITY_OFS_DELTA in negotiated_capabilities), progress=progress, ) if self._should_send_pack(new_refs): for chunk in PackChunkGenerator( num_records=pack_data_count, records=pack_data, object_format=DEFAULT_OBJECT_FORMAT, ): proto.write(chunk) ref_status = self._handle_receive_pack_tail( proto, negotiated_capabilities, progress ) return SendPackResult( _to_optional_dict(new_refs), agent=agent, ref_status=ref_status ) def fetch_pack( self, path: bytes | str, determine_wants: "DetermineWantsFunc", graph_walker: GraphWalker, pack_data: Callable[[bytes], int], progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Retrieve a pack from a git smart server. Args: path: Remote path to fetch from determine_wants: Function determine what refs to fetch. Receives dictionary of name->sha, should return list of shas to fetch. graph_walker: Object with next() and ack(). pack_data: Callback called for each bit of data in the pack progress: Callback for progress reports (strings) depth: Shallow fetch depth ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. 
filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: FetchPackResult object """ if ( protocol_version is not None and protocol_version not in GIT_PROTOCOL_VERSIONS ): raise ValueError(f"unknown Git protocol version {protocol_version}") proto, can_read, stderr = self._connect(b"upload-pack", path, protocol_version) server_protocol_version = negotiate_protocol_version(proto) if server_protocol_version not in GIT_PROTOCOL_VERSIONS: raise ValueError( f"unknown Git protocol version {server_protocol_version} used by server" ) if protocol_version and server_protocol_version > protocol_version: raise ValueError( f"bad Git protocol version {server_protocol_version} used by server" ) self.protocol_version = server_protocol_version with proto: # refs may have None values in v2 but not in v1 refs: dict[Ref, ObjectID | None] symrefs: dict[Ref, Ref] agent: bytes | None object_format: str | None if self.protocol_version == 2: try: server_capabilities = read_server_capabilities(proto.read_pkt_seq()) except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc ( negotiated_capabilities, symrefs, agent, ) = self._negotiate_upload_pack_capabilities(server_capabilities) object_format = extract_object_format_from_capabilities( server_capabilities ) proto.write_pkt_line(b"command=ls-refs\n") proto.write(b"0001") # delim-pkt proto.write_pkt_line(b"symrefs") proto.write_pkt_line(b"peel") if ref_prefix is None: ref_prefix = DEFAULT_REF_PREFIX for prefix in ref_prefix: proto.write_pkt_line(b"ref-prefix " + prefix) proto.write_pkt_line(None) refs, symrefs, _peeled = read_pkt_refs_v2(proto.read_pkt_seq()) else: try: refs_v1, server_capabilities = read_pkt_refs_v1( proto.read_pkt_seq() ) # v1 refs never have None values, but we need Optional type for compatibility refs = _to_optional_dict(refs_v1) except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc ( negotiated_capabilities, symrefs, agent, ) = self._negotiate_upload_pack_capabilities(server_capabilities) object_format = extract_object_format_from_capabilities( server_capabilities ) if ref_prefix is not None: refs = filter_ref_prefix(refs, ref_prefix) if refs is None: proto.write_pkt_line(None) return FetchPackResult( refs, symrefs, agent, object_format=object_format ) try: # Filter out None values (shouldn't be any in v1 protocol) refs_no_none = {k: v for k, v in refs.items() if v is not None} # Handle both old and new style determine_wants try: wants = determine_wants(refs_no_none, depth) except TypeError: # Old-style determine_wants that doesn't accept depth wants = determine_wants(refs_no_none) except BaseException: proto.write_pkt_line(None) raise if wants is not None: wants = [cid for cid in wants if cid != ZERO_SHA] if not wants: proto.write_pkt_line(None) return FetchPackResult( refs, symrefs, agent, object_format=object_format ) if self.protocol_version == 2: proto.write_pkt_line(b"command=fetch\n") proto.write(b"0001") # delim-pkt if CAPABILITY_THIN_PACK in self._fetch_capabilities: proto.write(pkt_line(b"thin-pack\n")) if ( find_capability( list(negotiated_capabilities), CAPABILITY_FETCH, CAPABILITY_FILTER, ) and filter_spec ): 
proto.write(pkt_line(b"filter %s\n" % filter_spec)) elif filter_spec: self._warn_filter_objects() elif filter_spec: self._warn_filter_objects() (new_shallow, new_unshallow) = _handle_upload_pack_head( proto, list(negotiated_capabilities), graph_walker, wants, can_read, depth=depth, protocol_version=self.protocol_version, shallow_since=shallow_since, shallow_exclude=shallow_exclude, ) _handle_upload_pack_tail( proto, negotiated_capabilities, graph_walker, pack_data, progress, protocol_version=self.protocol_version, ) return FetchPackResult( refs, symrefs, agent, new_shallow, new_unshallow, object_format ) def get_refs( self, path: bytes, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> LsRemoteResult: """Retrieve the current refs from a git smart server.""" # stock `git ls-remote` uses upload-pack if ( protocol_version is not None and protocol_version not in GIT_PROTOCOL_VERSIONS ): raise ValueError(f"unknown Git protocol version {protocol_version}") proto, _, stderr = self._connect(b"upload-pack", path, protocol_version) server_protocol_version = negotiate_protocol_version(proto) if server_protocol_version not in GIT_PROTOCOL_VERSIONS: raise ValueError( f"unknown Git protocol version {server_protocol_version} used by server" ) if protocol_version and server_protocol_version > protocol_version: raise ValueError( f"bad Git protocol version {server_protocol_version} used by server" ) self.protocol_version = server_protocol_version if self.protocol_version == 2: server_capabilities = read_server_capabilities(proto.read_pkt_seq()) object_format = extract_object_format_from_capabilities(server_capabilities) proto.write_pkt_line(b"command=ls-refs\n") proto.write(b"0001") # delim-pkt proto.write_pkt_line(b"symrefs") proto.write_pkt_line(b"peel") if ref_prefix is None: ref_prefix = DEFAULT_REF_PREFIX for prefix in ref_prefix: proto.write_pkt_line(b"ref-prefix " + prefix) proto.write_pkt_line(None) with proto: try: refs, symrefs, peeled = read_pkt_refs_v2(proto.read_pkt_seq()) except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc proto.write_pkt_line(None) for refname, refvalue in peeled.items(): refs[Ref(refname + PEELED_TAG_SUFFIX)] = refvalue return LsRemoteResult(refs, symrefs, object_format=object_format) else: with proto: try: refs_v1, server_capabilities = read_pkt_refs_v1( proto.read_pkt_seq() ) # v1 refs never have None values, but we need Optional type for compatibility refs = _to_optional_dict(refs_v1) except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc proto.write_pkt_line(None) object_format = extract_object_format_from_capabilities( server_capabilities ) (symrefs, _agent) = _extract_symrefs_and_agent(server_capabilities) if ref_prefix is not None: refs = filter_ref_prefix(refs, ref_prefix) return LsRemoteResult(refs, symrefs, object_format=object_format) def archive( self, path: bytes, committish: bytes, write_data: Callable[[bytes], None], progress: Callable[[bytes], None] | None = None, write_error: Callable[[bytes], None] | None = None, format: bytes | None = None, subdirs: Sequence[bytes] | None = None, prefix: bytes | None = None, ) -> None: """Request an archive of a specific commit. 
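        The request is sent to the ``upload-archive`` service as a series of
        ``argument ...`` pkt-lines; the server replies with ``ACK``, ``NACK``
        or ``ERR`` and then streams the archive over side-band-64k channels.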
Args: path: Repository path committish: Commit ID or ref to archive write_data: Function to write archive data progress: Optional progress callback write_error: Optional error callback format: Optional archive format subdirs: Optional subdirectories to include prefix: Optional prefix for archived files """ proto, _can_read, stderr = self._connect(b"upload-archive", path) with proto: if format is not None: proto.write_pkt_line(b"argument --format=" + format) proto.write_pkt_line(b"argument " + committish) if subdirs is not None: for subdir in subdirs: proto.write_pkt_line(b"argument " + subdir) if prefix is not None: proto.write_pkt_line(b"argument --prefix=" + prefix) proto.write_pkt_line(None) try: pkt = proto.read_pkt_line() except HangupException as exc: raise _remote_error_from_stderr(stderr) from exc if pkt == b"NACK\n" or pkt == b"NACK": return elif pkt == b"ACK\n" or pkt == b"ACK": pass elif pkt and pkt.startswith(b"ERR "): raise GitProtocolError(pkt[4:].rstrip(b"\n").decode("utf-8", "replace")) else: raise AssertionError(f"invalid response {pkt!r}") ret = proto.read_pkt_line() if ret is not None: raise AssertionError("expected pkt tail") for chan, data in _read_side_band64k_data(proto.read_pkt_seq()): if chan == SIDE_BAND_CHANNEL_DATA: write_data(data) elif chan == SIDE_BAND_CHANNEL_PROGRESS: if progress is not None: progress(data) elif chan == SIDE_BAND_CHANNEL_FATAL: if write_error is not None: write_error(data) else: raise AssertionError(f"Invalid sideband channel {chan}") class TCPGitClient(TraditionalGitClient): """A Git Client that works over TCP directly (i.e. git://).""" def __init__( self, host: str, port: int | None = None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Initialize a TCPGitClient. Args: host: Hostname or IP address to connect to port: Port number (defaults to TCP_GIT_PORT) thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags """ if port is None: port = TCP_GIT_PORT self._host = host self._port = port super().__init__( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, ) -> "TCPGitClient": """Create an instance of TCPGitClient from a parsed URL. Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb protocol (not used for TCPGitClient) username: Username for authentication (not used for TCPGitClient) password: Password for authentication (not used for TCPGitClient) config: Configuration object (not used for TCPGitClient) Returns: A TCPGitClient instance """ assert parsedurl.hostname is not None return cls( parsedurl.hostname, port=parsedurl.port, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) def get_url(self, path: str) -> str: r"""Get the URL for a TCP git connection. 
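        For example (illustrative host),
        ``TCPGitClient("example.com").get_url("/repo.git")`` returns
        ``"git://example.com/repo.git"``; a non-default port is appended to
        the netloc and IPv6 hosts are wrapped in brackets.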
Args: path: Repository path Returns: ``git://`` URL for the path """ # IPv6 addresses contain colons and need to be wrapped in brackets if ":" in self._host: netloc = f"[{self._host}]" else: netloc = self._host if self._port is not None and self._port != TCP_GIT_PORT: netloc += f":{self._port}" return urlunsplit(("git", netloc, path, "", "")) def _connect( self, cmd: bytes, path: str | bytes, protocol_version: int | None = None, ) -> tuple[Protocol, Callable[[], bool], IO[bytes] | None]: if not isinstance(cmd, bytes): raise TypeError(cmd) if not isinstance(path, bytes): path = path.encode(self._remote_path_encoding) sockaddrs = socket.getaddrinfo( self._host, self._port, socket.AF_UNSPEC, socket.SOCK_STREAM ) s = None err = OSError(f"no address found for {self._host}") for family, socktype, protof, canonname, sockaddr in sockaddrs: s = socket.socket(family, socktype, protof) s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) try: s.connect(sockaddr) break except OSError as e: err = e if s is not None: s.close() s = None if s is None: raise err # -1 means system default buffering rfile = s.makefile("rb", -1) # 0 means unbuffered wfile = s.makefile("wb", 0) def close() -> None: rfile.close() wfile.close() s.close() proto = Protocol( rfile.read, wfile.write, close, report_activity=self._report_activity, ) if path.startswith(b"/~"): path = path[1:] if cmd == b"upload-pack": if protocol_version is None: self.protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_FETCH else: self.protocol_version = protocol_version else: self.protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_SEND if cmd == b"upload-pack" and self.protocol_version == 2: # Git protocol version advertisement is hidden behind two NUL bytes # for compatibility with older Git server implementations, which # would crash if something other than a "host=" header was found # after the first NUL byte. version_str = b"\0\0version=%d\0" % self.protocol_version else: version_str = b"" # TODO(jelmer): Alternative to ascii? proto.send_cmd( b"git-" + cmd, path, b"host=" + self._host.encode("ascii") + version_str ) return proto, lambda: _fileno_can_read(s.fileno()), None class SubprocessWrapper: """A socket-like object that talks to a subprocess via pipes.""" def __init__(self, proc: subprocess.Popen[bytes]) -> None: """Initialize a SubprocessWrapper. Args: proc: Subprocess.Popen instance to wrap """ self.proc = proc assert proc.stdout is not None assert proc.stdin is not None self.read = BufferedReader(proc.stdout).read # type: ignore[type-var] self.write = proc.stdin.write @property def stderr(self) -> IO[bytes] | None: """Return the stderr stream of the subprocess.""" return self.proc.stderr def can_read(self) -> bool: """Check if there is data available to read. Returns: True if data is available, False otherwise """ if sys.platform == "win32": from msvcrt import get_osfhandle assert self.proc.stdout is not None handle = get_osfhandle(self.proc.stdout.fileno()) return _win32_peek_avail(handle) != 0 else: assert self.proc.stdout is not None return _fileno_can_read(self.proc.stdout.fileno()) def close(self, timeout: int | None = 60) -> None: """Close the subprocess and wait for it to terminate. 
Args: timeout: Maximum time to wait for subprocess to terminate (seconds) Raises: GitProtocolError: If subprocess doesn't terminate within timeout """ if self.proc.stdin: self.proc.stdin.close() if self.proc.stdout: self.proc.stdout.close() if self.proc.stderr: self.proc.stderr.close() try: self.proc.wait(timeout=timeout) except subprocess.TimeoutExpired as e: self.proc.kill() self.proc.wait() raise GitProtocolError( f"Git subprocess did not terminate within {timeout} seconds; killed it." ) from e def find_git_command() -> list[str]: """Find command to run for system Git (usually C Git).""" if sys.platform == "win32": # support .exe, .bat and .cmd try: # to avoid overhead import pywintypes import win32api except ImportError: # run through cmd.exe with some overhead return ["cmd", "/c", "git"] else: try: _status, git = win32api.FindExecutable("git") return [git] except pywintypes.error: return ["cmd", "/c", "git"] else: return ["git"] class SubprocessGitClient(TraditionalGitClient): """Git client that talks to a server using a subprocess.""" @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, ) -> "SubprocessGitClient": """Create an instance of SubprocessGitClient from a parsed URL. Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb protocol (not used for SubprocessGitClient) username: Username for authentication (not used for SubprocessGitClient) password: Password for authentication (not used for SubprocessGitClient) config: Configuration object (not used for SubprocessGitClient) Returns: A SubprocessGitClient instance """ return cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) git_command: str | None = None def _connect( self, service: bytes, path: bytes | str, protocol_version: int | None = None, ) -> tuple[Protocol, Callable[[], bool], IO[bytes] | None]: if not isinstance(service, bytes): raise TypeError(service) if isinstance(path, bytes): path = path.decode(self._remote_path_encoding) if self.git_command is None: git_command = find_git_command() argv = [*git_command, service.decode("ascii"), path] p = subprocess.Popen( argv, bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) pw = SubprocessWrapper(p) return ( Protocol( pw.read, pw.write, pw.close, report_activity=self._report_activity, ), pw.can_read, p.stderr, ) class LocalGitClient(GitClient): """Git Client that just uses a local on-disk repository.""" def __init__( self, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, config: Config | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Create a new LocalGitClient instance. Args: thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity. 
config: Optional configuration object quiet: Whether to suppress progress output include_tags: Whether to include tags """ self._report_activity = report_activity self._quiet = quiet self._include_tags = include_tags # Ignore the thin_packs argument def get_url(self, path: str) -> str: """Get the URL for a local file path. Args: path: Local file path Returns: file:// URL for the path """ return urlunsplit(("file", "", path, "", "")) @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, ) -> "LocalGitClient": """Create an instance of LocalGitClient from a parsed URL. Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb protocol (not used for LocalGitClient) username: Username for authentication (not used for LocalGitClient) password: Password for authentication (not used for LocalGitClient) config: Optional configuration object Returns: A LocalGitClient instance """ return cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, config=config, ) @classmethod def _open_repo(cls, path: str | bytes) -> "closing[Repo]": """Open a local repository. Args: path: Repository path (as bytes or str) Returns: Repo instance wrapped in a closing context manager """ if not isinstance(path, str): path = os.fsdecode(path) return closing(Repo(path)) def send_pack( self, path: str | bytes, update_refs: Callable[[dict[Ref, ObjectID]], dict[Ref, ObjectID]], generate_pack_data: "GeneratePackDataFunc", progress: Callable[[bytes], None] | None = None, ) -> SendPackResult: """Upload a pack to a local on-disk repository. Args: path: Repository path (as bytestring) update_refs: Function to determine changes to remote refs. Receive dict with existing remote refs, returns dict with changed refs (name -> sha, where sha=ZERO_SHA for deletions) with number of items and pack data to upload. 
generate_pack_data: Function that generates pack data given have and want object sets progress: Optional progress function Returns: SendPackResult Raises: SendPackError: if server rejects the pack data """ if not progress: def progress(x: bytes) -> None: pass with self._open_repo(path) as target: old_refs = target.get_refs() new_refs = update_refs(dict(old_refs)) have = [sha1 for sha1 in old_refs.values() if sha1 != ZERO_SHA] want = [] for refname, new_sha1 in new_refs.items(): if ( new_sha1 not in have and new_sha1 not in want and new_sha1 != ZERO_SHA ): want.append(new_sha1) if not want and set(new_refs.items()).issubset(set(old_refs.items())): return SendPackResult(_to_optional_dict(new_refs), ref_status={}) target.object_store.add_pack_data( *generate_pack_data( set(have), set(want), ofs_delta=True, progress=progress ) ) ref_status: dict[bytes, str | None] = {} for refname, new_sha1 in new_refs.items(): old_sha1 = old_refs.get(refname, ZERO_SHA) if new_sha1 != ZERO_SHA: if not target.refs.set_if_equals(refname, old_sha1, new_sha1): msg = f"unable to set {refname!r} to {new_sha1!r}" progress(msg.encode()) ref_status[refname] = msg else: if not target.refs.remove_if_equals(refname, old_sha1): progress(f"unable to remove {refname!r}".encode()) ref_status[refname] = "unable to remove" return SendPackResult(_to_optional_dict(new_refs), ref_status=ref_status) def fetch( self, path: bytes | str, target: BaseRepo, determine_wants: "DetermineWantsFunc | None" = None, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Fetch into a target repository. Args: path: Path to fetch from (as bytestring) target: Target repository to fetch into determine_wants: Optional function determine what refs to fetch. Receives dictionary of name->sha, should return list of shas to fetch. Defaults to all shas. progress: Optional progress function depth: Shallow fetch depth ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Optional Git protocol version shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: FetchPackResult object """ with self._open_repo(path) as r: refs = r.fetch( target, determine_wants=determine_wants, progress=progress, depth=depth, ) return FetchPackResult( _to_optional_dict(refs), r.refs.get_symrefs(), agent_string(), object_format=r.object_format.name, ) def fetch_pack( self, path: str | bytes, determine_wants: "DetermineWantsFunc", graph_walker: GraphWalker, pack_data: Callable[[bytes], int], progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Retrieve a pack from a local on-disk repository. Args: path: Remote path to fetch from determine_wants: Function determine what refs to fetch. 
Receives dictionary of name->sha, should return list of shas to fetch. graph_walker: Object with next() and ack(). pack_data: Callback called for each bit of data in the pack progress: Callback for progress reports (strings) depth: Shallow fetch depth ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Optional Git protocol version shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: FetchPackResult object """ with self._open_repo(path) as r: missing_objects = r.find_missing_objects( determine_wants, graph_walker, progress=progress, depth=depth ) if missing_objects is None: other_haves = set() object_ids = [] else: other_haves = missing_objects.get_remote_has() object_ids = list(missing_objects) symrefs = r.refs.get_symrefs() agent = agent_string() # Did the process short-circuit (e.g. in a stateless RPC call)? # Note that the client still expects a 0-object pack in most cases. if object_ids is None: return FetchPackResult( None, symrefs, agent, object_format=r.object_format.name ) write_pack_from_container( pack_data, # type: ignore[arg-type] r.object_store, object_ids, other_haves=other_haves, object_format=r.object_format, ) # Convert refs to Optional type for FetchPackResult return FetchPackResult( _to_optional_dict(r.get_refs()), symrefs, agent, object_format=r.object_format.name, ) def get_refs( self, path: str | bytes, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> LsRemoteResult: """Retrieve the current refs from a local on-disk repository.""" with self._open_repo(path) as target: refs_dict = target.get_refs() refs = _to_optional_dict(refs_dict) # Extract symrefs from the local repository from dulwich.refs import Ref symrefs: dict[Ref, Ref] = {} for ref in refs: try: # Check if this ref is symbolic by reading it directly ref_value = target.refs.read_ref(ref) if ref_value and ref_value.startswith(SYMREF): # Extract the target from the symref symrefs[ref] = Ref(ref_value[len(SYMREF) :]) except (KeyError, ValueError): # Not a symbolic ref or error reading it pass return LsRemoteResult( refs, symrefs, object_format=target.object_format.name ) def clone( self, path: str, target_path: str, mkdir: bool = True, bare: bool = False, origin: str | None = "origin", checkout: bool | None = None, branch: str | None = None, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, ) -> Repo: """Clone a local repository. For local clones, we can detect the object format before creating the target repository. 
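        Example (illustrative only; the repository paths shown here are
        hypothetical):

            >>> client = LocalGitClient()
            >>> repo = client.clone("/srv/git/project.git", "/tmp/project")  # doctest: +SKIP
            >>> repo.close()  # doctest: +SKIP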
""" # Detect the object format from the source repository with self._open_repo(path) as source_repo: object_format_name = source_repo.object_format.name if mkdir: os.mkdir(target_path) try: # Create repository with the correct object format from the start target = None if not bare: target = Repo.init(target_path, object_format=object_format_name) if checkout is None: checkout = True else: if checkout: raise ValueError("checkout and bare are incompatible") target = Repo.init_bare(target_path, object_format=object_format_name) encoded_path = path.encode("utf-8") assert target is not None if origin is not None: target_config = target.get_config() target_config.set( (b"remote", origin.encode("utf-8")), b"url", encoded_path ) target_config.set( (b"remote", origin.encode("utf-8")), b"fetch", b"+refs/heads/*:refs/remotes/" + origin.encode("utf-8") + b"/*", ) target_config.write_to_path() ref_message = b"clone: from " + encoded_path result = self.fetch( path.encode("utf-8"), target, progress=progress, depth=depth, ref_prefix=ref_prefix, filter_spec=filter_spec, protocol_version=protocol_version, ) if origin is not None: _import_remote_refs( target.refs, origin, result.refs, message=ref_message ) origin_head = result.symrefs.get(HEADREF) origin_sha = result.refs.get(HEADREF) if origin is None or (origin_sha and not origin_head): # set detached HEAD if origin_sha is not None: target.refs[HEADREF] = origin_sha head = origin_sha else: head = None else: _set_origin_head(target.refs, origin.encode("utf-8"), origin_head) head_ref = _set_default_branch( target.refs, origin.encode("utf-8"), origin_head, branch.encode("utf-8") if branch is not None else None, ref_message, ) # Update target head if head_ref: head = _set_head(target.refs, head_ref, ref_message) else: head = None if checkout and head is not None: target.get_worktree().reset_index() except BaseException: if target is not None: target.close() if mkdir: import shutil shutil.rmtree(target_path) raise return target class BundleClient(GitClient): """Git Client that reads from a bundle file.""" def __init__( self, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, config: Config | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Create a new BundleClient instance. Args: thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity. config: Optional configuration object quiet: Whether to suppress progress output include_tags: Whether to include tags """ self._report_activity = report_activity self._quiet = quiet self._include_tags = include_tags def get_url(self, path: str) -> str: """Get the URL for a bundle file path. Args: path: Bundle file path Returns: The path unchanged (bundle files use local paths) """ return path @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, ) -> "BundleClient": """Create an instance of BundleClient from a parsed URL. 
Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb protocol (not used for BundleClient) username: Username for authentication (not used for BundleClient) password: Password for authentication (not used for BundleClient) config: Configuration object (not used for BundleClient) Returns: A BundleClient instance """ return cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) @classmethod def _is_bundle_file(cls, path: str) -> bool: """Check if a file is a git bundle by reading the first line.""" try: with open(path, "rb") as f: first_line = f.readline() return first_line in (b"# v2 git bundle\n", b"# v3 git bundle\n") except OSError: return False @classmethod def _open_bundle(cls, path: str | bytes) -> "Bundle": """Open and parse a bundle file. Args: path: Path to the bundle file (bytes or str) Returns: Bundle object with parsed metadata Raises: AssertionError: If bundle format is unsupported """ if not isinstance(path, str): path = os.fsdecode(path) # Read bundle metadata without PackData to avoid file handle issues with open(path, "rb") as f: from dulwich.bundle import Bundle version = None firstline = f.readline() if firstline == b"# v2 git bundle\n": version = 2 elif firstline == b"# v3 git bundle\n": version = 3 else: raise AssertionError(f"unsupported bundle format header: {firstline!r}") capabilities: dict[str, str | None] = {} prerequisites: list[tuple[ObjectID, bytes]] = [] references: dict[Ref, ObjectID] = {} line = f.readline() if version >= 3: while line.startswith(b"@"): line = line[1:].rstrip(b"\n") try: key, value_bytes = line.split(b"=", 1) value = value_bytes.decode("utf-8") except ValueError: key = line value = None capabilities[key.decode("utf-8")] = value line = f.readline() while line.startswith(b"-"): (obj_id, comment) = line[1:].rstrip(b"\n").split(b" ", 1) prerequisites.append((ObjectID(obj_id), comment)) line = f.readline() while line != b"\n": (obj_id, ref) = line.rstrip(b"\n").split(b" ", 1) references[Ref(ref)] = ObjectID(obj_id) line = f.readline() # Don't read PackData here, we'll do it later bundle = Bundle() bundle.version = version bundle.capabilities = capabilities bundle.prerequisites = prerequisites bundle.references = references bundle.pack_data = None # Will be read on demand return bundle @staticmethod def _skip_to_pack_data(f: IO[bytes], version: int) -> None: """Skip to the pack data section in a bundle file. 
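        The layout skipped over is the same one ``_open_bundle`` parses: the
        ``# v2 git bundle``/``# v3 git bundle`` header line, any ``@key=value``
        capability lines (version 3 only), ``-<sha> <comment>`` prerequisite
        lines, ``<sha> <ref>`` reference lines, and the blank line that
        separates them from the pack data.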
Args: f: File object positioned at the beginning of the bundle version: Bundle format version (2 or 3) Raises: AssertionError: If bundle header is invalid """ # Skip header header = f.readline() if header not in (b"# v2 git bundle\n", b"# v3 git bundle\n"): raise AssertionError(f"Invalid bundle header: {header!r}") line = f.readline() # Skip capabilities (v3 only) if version >= 3: while line.startswith(b"@"): line = f.readline() # Skip prerequisites while line.startswith(b"-"): line = f.readline() # Skip references while line != b"\n": line = f.readline() # Now at pack data def send_pack( self, path: str | bytes, update_refs: Callable[[dict[Ref, ObjectID]], dict[Ref, ObjectID]], generate_pack_data: "GeneratePackDataFunc", progress: Callable[[bytes], None] | None = None, ) -> SendPackResult: """Upload is not supported for bundle files.""" raise NotImplementedError("Bundle files are read-only") def fetch( self, path: bytes | str, target: BaseRepo, determine_wants: "DetermineWantsFunc | None" = None, progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Fetch into a target repository from a bundle file.""" bundle = self._open_bundle(path) # Get references from bundle refs = dict(bundle.references) # Determine what we want to fetch if determine_wants is None: _ = list(refs.values()) else: _ = determine_wants(refs, None) # Add pack data to target repository # Need to reopen the file for pack data access with open(path, "rb") as pack_file: # Skip to pack data section assert bundle.version is not None BundleClient._skip_to_pack_data(pack_file, bundle.version) # Read pack data into memory to avoid file positioning issues pack_bytes = pack_file.read() # Create PackData from in-memory bytes from io import BytesIO pack_io = BytesIO(pack_bytes) pack_data = PackData.from_file(pack_io, object_format=DEFAULT_OBJECT_FORMAT) try: target.object_store.add_pack_data(len(pack_data), pack_data.iter_unpacked()) finally: pack_data.close() # Apply ref filtering if specified if ref_prefix: filtered_refs = {} for ref_name, ref_value in refs.items(): for prefix in ref_prefix: if ref_name.startswith(prefix): filtered_refs[ref_name] = ref_value break refs = filtered_refs return FetchPackResult(_to_optional_dict(refs), {}, agent_string()) def fetch_pack( self, path: str | bytes, determine_wants: "DetermineWantsFunc", graph_walker: GraphWalker, pack_data: Callable[[bytes], int], progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Retrieve a pack from a bundle file.""" bundle = self._open_bundle(path) # Get references from bundle refs = dict(bundle.references) # Determine what we want to fetch try: _ = determine_wants(refs, depth) except TypeError: # Old-style determine_wants that doesn't accept depth _ = determine_wants(refs) # Write pack data to the callback # Need to reopen the file for pack data access with open(path, "rb") as pack_file: # Skip to pack data section assert bundle.version is not None BundleClient._skip_to_pack_data(pack_file, bundle.version) # Read pack data and write it to the callback pack_bytes = pack_file.read() pack_data(pack_bytes) 
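        # Bundles are self-contained: the negotiation results from
        # determine_wants and graph_walker are not used to trim the pack,
        # which is passed on to the callback verbatim above.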
# Apply ref filtering if specified if ref_prefix: filtered_refs = {} for ref_name, ref_value in refs.items(): for prefix in ref_prefix: if ref_name.startswith(prefix): filtered_refs[ref_name] = ref_value break refs = filtered_refs return FetchPackResult(_to_optional_dict(refs), {}, agent_string()) def get_refs( self, path: str | bytes, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> LsRemoteResult: """Retrieve the current refs from a bundle file.""" bundle = self._open_bundle(path) refs = dict(bundle.references) # Apply ref filtering if specified if ref_prefix: filtered_refs = {} for ref_name, ref_value in refs.items(): for prefix in ref_prefix: if ref_name.startswith(prefix): filtered_refs[ref_name] = ref_value break refs = filtered_refs # Bundle refs are always concrete (never None), but LsRemoteResult expects Optional return LsRemoteResult(_to_optional_dict(refs), {}) # What Git client to use for local access default_local_git_client_cls = LocalGitClient class SSHVendor: """A client side SSH implementation.""" def run_command( self, host: str, command: bytes, username: str | None = None, port: int | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, protocol_version: int | None = None, ) -> SubprocessWrapper: """Connect to an SSH server. Run a command remotely and return a file-like object for interaction with the remote command. Args: host: Host name command: Command to run (as argv array) username: Optional ame of user to log in as port: Optional SSH port to use password: Optional ssh password for login or private key key_filename: Optional path to private keyfile ssh_command: Optional SSH command protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. """ raise NotImplementedError(self.run_command) class StrangeHostname(Exception): """Refusing to connect to strange SSH hostname.""" def __init__(self, hostname: str) -> None: """Initialize StrangeHostname exception. Args: hostname: The strange hostname that was rejected """ super().__init__(hostname) class SubprocessSSHVendor(SSHVendor): """SSH vendor that shells out to the local 'ssh' command.""" def run_command( self, host: str, command: bytes, username: str | None = None, port: int | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, protocol_version: int | None = None, ) -> SubprocessWrapper: """Run a git command over SSH. Args: host: SSH host to connect to command: Git command to run username: Optional username port: Optional port number password: Optional password (not supported) key_filename: Optional SSH key file ssh_command: Optional custom SSH command protocol_version: Optional Git protocol version Returns: Tuple of (subprocess.Popen, Protocol, stderr_stream) """ if password is not None: raise NotImplementedError( "Setting password not supported by SubprocessSSHVendor." 
) if ssh_command: import shlex args = [*shlex.split(ssh_command, posix=sys.platform != "win32"), "-x"] else: args = ["ssh", "-x"] if port: args.extend(["-p", str(port)]) if key_filename: args.extend(["-i", str(key_filename)]) if protocol_version is None: protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_FETCH if protocol_version > 0: args.extend(["-o", f"SetEnv GIT_PROTOCOL=version={protocol_version}"]) if username: host = f"{username}@{host}" if host.startswith("-"): raise StrangeHostname(hostname=host) args.append(host) proc = subprocess.Popen( [*args, command], bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) return SubprocessWrapper(proc) class PLinkSSHVendor(SSHVendor): """SSH vendor that shells out to the local 'plink' command.""" def run_command( self, host: str, command: bytes, username: str | None = None, port: int | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, protocol_version: int | None = None, ) -> SubprocessWrapper: """Run a git command over SSH using PLink. Args: host: SSH host to connect to command: Git command to run username: Optional username port: Optional port number password: Optional password key_filename: Optional SSH key file ssh_command: Optional custom SSH command protocol_version: Optional Git protocol version Returns: Tuple of (subprocess.Popen, Protocol, stderr_stream) """ if ssh_command: import shlex args = [*shlex.split(ssh_command, posix=sys.platform != "win32"), "-ssh"] elif sys.platform == "win32": args = ["plink.exe", "-ssh"] else: args = ["plink", "-ssh"] if password is not None: import warnings warnings.warn( "Invoking PLink with a password exposes the password in the " "process list." ) args.extend(["-pw", str(password)]) if port: args.extend(["-P", str(port)]) if key_filename: args.extend(["-i", str(key_filename)]) if username: host = f"{username}@{host}" if host.startswith("-"): raise StrangeHostname(hostname=host) args.append(host) # plink.exe does not provide a way to pass environment variables # via the command line. The best we can do is set an environment # variable and hope that plink will pass it to the server. If this # does not work then the server should behave as if we had requested # protocol version 0. env = copy.deepcopy(os.environ) if protocol_version is None: protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_FETCH if protocol_version > 0: env["GIT_PROTOCOL"] = f"version={protocol_version}" proc = subprocess.Popen( [*args, command], bufsize=0, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, ) return SubprocessWrapper(proc) # Can be overridden by users get_ssh_vendor: Callable[[], SSHVendor] = SubprocessSSHVendor class SSHGitClient(TraditionalGitClient): """Git client that connects over SSH.""" def __init__( self, host: str, port: int | None = None, username: str | None = None, vendor: SSHVendor | None = None, config: Config | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, path_encoding: str = TraditionalGitClient.DEFAULT_ENCODING, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Initialize SSHGitClient. 
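        The SSH command used to reach the server is resolved in this order:
        the ``ssh_command`` argument, the ``GIT_SSH_COMMAND`` environment
        variable, the ``GIT_SSH`` environment variable, the ``core.sshCommand``
        configuration value, and finally plain ``ssh``.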
Args: host: SSH hostname port: Optional SSH port username: Optional username vendor: Optional SSH vendor config: Optional configuration password: Optional password key_filename: Optional SSH key file ssh_command: Optional custom SSH command path_encoding: Encoding for paths (default: utf-8) thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress output include_tags: Send annotated tags when sending the objects they point to """ self.host = host self.port = port self.username = username self.password = password self.key_filename = key_filename # Priority: ssh_command parameter, then env vars, then core.sshCommand config if ssh_command: self.ssh_command = ssh_command else: # Check environment variables first env_ssh_command = os.environ.get("GIT_SSH_COMMAND") if env_ssh_command: self.ssh_command = env_ssh_command else: env_ssh = os.environ.get("GIT_SSH") if env_ssh: self.ssh_command = env_ssh else: # Fall back to config if no environment variable set if config is not None: try: config_ssh_command = config.get((b"core",), b"sshCommand") self.ssh_command = ( config_ssh_command.decode() if config_ssh_command else "ssh" ) except KeyError: self.ssh_command = "ssh" else: self.ssh_command = "ssh" super().__init__( path_encoding=path_encoding, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) self.alternative_paths: dict[bytes, bytes] = {} if vendor is not None: self.ssh_vendor = vendor else: self.ssh_vendor = get_ssh_vendor() def get_url(self, path: str) -> str: """Get the SSH URL for a path.""" netloc = self.host if self.port is not None: netloc += f":{self.port}" if self.username is not None: netloc = urlquote(self.username, "@/:") + "@" + netloc return urlunsplit(("ssh", netloc, path, "", "")) @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, path_encoding: str = TraditionalGitClient.DEFAULT_ENCODING, vendor: SSHVendor | None = None, key_filename: str | None = None, ssh_command: str | None = None, ) -> "SSHGitClient": """Create an SSHGitClient from a parsed URL. 
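        If no explicit ``username`` is supplied, the username component of the
        parsed URL (if any) is used instead.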
Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb protocol (not used for SSHGitClient) username: SSH username password: SSH password config: Configuration object path_encoding: Encoding for paths vendor: SSH implementation to use key_filename: Optional SSH key file ssh_command: Optional custom SSH command Returns: An SSHGitClient instance """ if parsedurl.hostname is None: raise ValueError("SSH URL must have a hostname") return cls( host=parsedurl.hostname, port=parsedurl.port, username=username or parsedurl.username, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, path_encoding=path_encoding, vendor=vendor, config=config, password=password, key_filename=key_filename, ssh_command=ssh_command, ) def _get_cmd_path(self, cmd: bytes) -> bytes: cmd = self.alternative_paths.get(cmd, b"git-" + cmd) assert isinstance(cmd, bytes) return cmd def _connect( self, cmd: bytes, path: str | bytes, protocol_version: int | None = None, ) -> tuple[Protocol, Callable[[], bool], IO[bytes] | None]: if not isinstance(cmd, bytes): raise TypeError(cmd) if isinstance(path, bytes): path = path.decode(self._remote_path_encoding) if path.startswith("/~"): path = path[1:] argv = ( self._get_cmd_path(cmd) + b" '" + path.encode(self._remote_path_encoding) + b"'" ) kwargs = {} if self.password is not None: kwargs["password"] = self.password if self.key_filename is not None: kwargs["key_filename"] = self.key_filename # GIT_SSH_COMMAND takes precedence over GIT_SSH if self.ssh_command is not None: kwargs["ssh_command"] = self.ssh_command con = self.ssh_vendor.run_command( self.host, argv, port=self.port, username=self.username, protocol_version=protocol_version, **kwargs, ) return ( Protocol( con.read, con.write, con.close, report_activity=self._report_activity, ), con.can_read, getattr(con, "stderr", None), ) def default_user_agent_string() -> str: """Return the default user agent string for Dulwich.""" # Start user agent with "git/", because GitHub requires this. :-( See # https://github.com/jelmer/dulwich/issues/562 for details. return "git/dulwich/{}".format(".".join([str(x) for x in dulwich.__version__])) def _urlmatch_http_sections( config: Config, url: str | None ) -> Iterator[tuple[bytes, ...]]: """Yield http config sections matching the given URL, ordered by specificity. Yields sections from least specific to most specific, so callers can apply settings in order with more specific settings overriding less specific ones. 
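    For example, with a hypothetical configuration containing the sections
    ``("http",)``, ``("http", b"https://example.com")`` and
    ``("http", b"https://example.com/repo.git")``, a lookup for
    ``https://example.com/repo.git`` yields all three, with the most
    URL-specific section last so that its settings take precedence.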
Args: config: Git configuration object url: URL to match against config sections (if None, only yields global http section) Yields: Config section tuples that match the URL, ordered by specificity """ encoding = getattr(config, "encoding", None) or sys.getdefaultencoding() parsed_url = urlparse(url) if url else None # Collect all matching sections with their specificity # (specificity is based on URL path length - longer = more specific) matching_sections: list[tuple[int, tuple[bytes, ...]]] = [] for config_section in config.sections(): if config_section[0] != b"http": continue if len(config_section) < 2: # Global http section (least specific) matching_sections.append((0, config_section)) elif parsed_url is not None: # URL-specific http section - only match if we have a URL config_url = config_section[1].decode(encoding) parsed_config_url = urlparse(config_url) is_match = False if parsed_config_url.scheme and parsed_config_url.netloc: is_match = match_urls(parsed_url, parsed_config_url) else: is_match = match_partial_url(parsed_url, config_url) if is_match: # Calculate specificity based on URL path length specificity = len(parsed_config_url.path.rstrip("/")) matching_sections.append((specificity, config_section)) # Sort by specificity (least specific first) matching_sections.sort(key=lambda x: x[0]) for _, section in matching_sections: yield section class AuthCallbackPoolManager: """Pool manager wrapper that handles authentication callbacks.""" def __init__( self, pool_manager: "urllib3.PoolManager | urllib3.ProxyManager", auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, proxy_auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, ) -> None: self._pool_manager = pool_manager self._auth_callback = auth_callback self._proxy_auth_callback = proxy_auth_callback self._auth_attempts: dict[str, int] = {} def __getattr__(self, name: str): # type: ignore[no-untyped-def] # Delegate all other attributes to the wrapped pool manager return getattr(self._pool_manager, name) def request(self, method: str, url: str, *args, **kwargs): # type: ignore[no-untyped-def] """Make HTTP request with authentication callback support.""" max_attempts = 3 attempts = self._auth_attempts.get(url, 0) while attempts < max_attempts: response = self._pool_manager.request(method, url, *args, **kwargs) if response.status == 401 and self._auth_callback: # HTTP authentication required www_authenticate = response.headers.get("WWW-Authenticate", "") attempts += 1 self._auth_attempts[url] = attempts # Call the authentication callback credentials = self._auth_callback(url, www_authenticate, attempts) if credentials: # Update request with new credentials import urllib3.util auth_header = urllib3.util.make_headers( basic_auth=f"{credentials['username']}:{credentials.get('password', '')}" ) if "headers" in kwargs: kwargs["headers"].update(auth_header) else: kwargs["headers"] = auth_header # Retry the request continue elif response.status == 407 and self._proxy_auth_callback: # Proxy authentication required proxy_authenticate = response.headers.get("Proxy-Authenticate", "") attempts += 1 self._auth_attempts[url] = attempts # Call the proxy authentication callback credentials = self._proxy_auth_callback( url, proxy_authenticate, attempts ) if credentials: # Update request with new proxy credentials import urllib3.util proxy_auth_header = urllib3.util.make_headers( proxy_basic_auth=f"{credentials['username']}:{credentials.get('password', '')}" ) if "headers" in kwargs: 
kwargs["headers"].update(proxy_auth_header) else: kwargs["headers"] = proxy_auth_header # Retry the request continue # Clear attempts on success or non-auth failure if url in self._auth_attempts: del self._auth_attempts[url] return response # Max attempts reached return response def default_urllib3_manager( config: Config | None, pool_manager_cls: type | None = None, proxy_manager_cls: type | None = None, base_url: str | None = None, timeout: float | None = None, cert_reqs: str | None = None, auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, proxy_auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, ) -> "urllib3.ProxyManager | urllib3.PoolManager | AuthCallbackPoolManager": """Return urllib3 connection pool manager. Honour detected proxy configurations. Args: config: `dulwich.config.ConfigDict` instance with Git configuration. pool_manager_cls: Pool manager class to use proxy_manager_cls: Proxy manager class to use base_url: Base URL for proxy bypass checks timeout: Timeout for HTTP requests in seconds cert_reqs: SSL certificate requirements (e.g. "CERT_REQUIRED", "CERT_NONE") auth_callback: Optional callback for HTTP authentication proxy_auth_callback: Optional callback for proxy authentication Returns: Either pool_manager_cls (defaults to ``urllib3.ProxyManager``) instance for proxy configurations, proxy_manager_cls (defaults to ``urllib3.PoolManager``) instance otherwise. If auth callbacks are provided, returns an AuthCallbackPoolManager wrapper. """ proxy_server: str | None = None user_agent: str | None = None ca_certs: str | None = None ssl_verify: bool | None = None if proxy_server is None: for proxyname in ("https_proxy", "http_proxy", "all_proxy"): proxy_server = os.environ.get(proxyname) if proxy_server: break if proxy_server: if check_for_proxy_bypass(base_url): proxy_server = None if config is not None: # Iterate through all matching http sections from least to most specific # More specific settings will override less specific ones for section in _urlmatch_http_sections(config, base_url): if proxy_server is None: try: proxy_server_bytes = config.get(section, b"proxy") except KeyError: pass else: if proxy_server_bytes is not None: proxy_server = proxy_server_bytes.decode("utf-8") try: user_agent_bytes = config.get(section, b"useragent") except KeyError: pass else: if user_agent_bytes is not None: user_agent = user_agent_bytes.decode("utf-8") try: ssl_verify_value = config.get_boolean(section, b"sslVerify") except KeyError: pass else: if ssl_verify_value is not None: ssl_verify = ssl_verify_value try: ca_certs_bytes = config.get(section, b"sslCAInfo") except KeyError: pass else: if ca_certs_bytes is not None: ca_certs = ca_certs_bytes.decode("utf-8") if timeout is None: try: timeout_bytes = config.get(section, b"timeout") except KeyError: pass else: if timeout_bytes is not None: timeout = float(timeout_bytes.decode("utf-8")) # Default ssl_verify to True if not set if ssl_verify is None: ssl_verify = True if user_agent is None: user_agent = default_user_agent_string() headers = {"User-agent": user_agent} # Check for extra headers in config with URL matching if config is not None: # Apply extra headers from least specific to most specific for section in _urlmatch_http_sections(config, base_url): try: extra_headers = config.get_multivar(section, b"extraHeader") except KeyError: continue for extra_header in extra_headers: if not extra_header: logger.warning("Ignoring empty http.extraHeader value") continue if b": " not in 
extra_header: logger.warning( "Ignoring invalid http.extraHeader value %r (missing ': ' separator)", extra_header, ) continue # Parse the header (format: "Header-Name: value") header_name, header_value = extra_header.split(b": ", 1) try: headers[header_name.decode("utf-8")] = header_value.decode("utf-8") except UnicodeDecodeError as e: logger.warning( "Ignoring http.extraHeader with invalid UTF-8: %s", e ) kwargs: dict[str, str | float | None] = { "ca_certs": ca_certs, } # Add timeout if specified if timeout is not None: kwargs["timeout"] = timeout # Handle cert_reqs - allow override from parameter if cert_reqs is not None: kwargs["cert_reqs"] = cert_reqs elif ssl_verify is True: kwargs["cert_reqs"] = "CERT_REQUIRED" elif ssl_verify is False: kwargs["cert_reqs"] = "CERT_NONE" else: # Default to SSL verification kwargs["cert_reqs"] = "CERT_REQUIRED" import urllib3 # Check for proxy authentication method configuration proxy_auth_method: str | None = None if config is not None: try: proxy_auth_method_bytes = config.get(b"http", b"proxyAuthMethod") if proxy_auth_method_bytes and isinstance(proxy_auth_method_bytes, bytes): proxy_auth_method = proxy_auth_method_bytes.decode().lower() except KeyError: pass # Check environment variable override env_proxy_auth = os.environ.get("GIT_HTTP_PROXY_AUTHMETHOD") if env_proxy_auth: proxy_auth_method = env_proxy_auth.lower() base_manager: urllib3.ProxyManager | urllib3.PoolManager if proxy_server is not None: if proxy_manager_cls is None: proxy_manager_cls = urllib3.ProxyManager if not isinstance(proxy_server, str): proxy_server = proxy_server.decode() proxy_server_url = urlparse(proxy_server) # Validate proxy auth method if specified if proxy_auth_method and proxy_auth_method not in ("anyauth", "basic"): # Only basic and anyauth are currently supported # Other methods like digest, negotiate, ntlm would require additional libraries raise NotImplementedError( f"Proxy authentication method '{proxy_auth_method}' is not supported. " "Only 'basic' and 'anyauth' are currently supported." 
) if proxy_server_url.username is not None: proxy_headers = urllib3.make_headers( proxy_basic_auth=f"{proxy_server_url.username}:{proxy_server_url.password or ''}" ) else: proxy_headers = {} base_manager = proxy_manager_cls( proxy_server, proxy_headers=proxy_headers, headers=headers, **kwargs ) else: if pool_manager_cls is None: pool_manager_cls = urllib3.PoolManager base_manager = pool_manager_cls(headers=headers, **kwargs) # Wrap with AuthCallbackPoolManager if callbacks are provided if auth_callback is not None or proxy_auth_callback is not None: return AuthCallbackPoolManager( base_manager, auth_callback=auth_callback, proxy_auth_callback=proxy_auth_callback, ) return base_manager def check_for_proxy_bypass(base_url: str | None) -> bool: """Check if proxy should be bypassed for the given URL.""" # Check if a proxy bypass is defined with the no_proxy environment variable if base_url: # only check if base_url is provided no_proxy_str = os.environ.get("no_proxy") if no_proxy_str: # implementation based on curl behavior: https://curl.se/libcurl/c/CURLOPT_NOPROXY.html # get hostname of provided parsed url parsed_url = urlparse(base_url) hostname = parsed_url.hostname if hostname: import ipaddress # check if hostname is an ip address try: hostname_ip = ipaddress.ip_address(hostname) except ValueError: hostname_ip = None no_proxy_values = no_proxy_str.split(",") for no_proxy_value in no_proxy_values: no_proxy_value = no_proxy_value.strip() if no_proxy_value: no_proxy_value = no_proxy_value.lower() no_proxy_value = no_proxy_value.lstrip( "." ) # ignore leading dots if hostname_ip: # check if no_proxy_value is a ip network try: no_proxy_value_network = ipaddress.ip_network( no_proxy_value, strict=False ) except ValueError: no_proxy_value_network = None if no_proxy_value_network: # if hostname is a ip address and no_proxy_value is a ip network -> check if ip address is part of network if hostname_ip in no_proxy_value_network: return True if no_proxy_value == "*": # '*' is special case for always bypass proxy return True if hostname == no_proxy_value: return True no_proxy_value = ( "." + no_proxy_value ) # add a dot to only match complete domains if hostname.endswith(no_proxy_value): return True return False class AbstractHttpGitClient(GitClient): """Abstract base class for HTTP Git Clients. This is agonistic of the actual HTTP implementation. Subclasses should provide an implementation of the _http_request method. """ def __init__( self, base_url: str, dumb: bool = False, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, username: str | None = None, password: str | None = None, ) -> None: """Initialize AbstractHttpGitClient.""" self._base_url = base_url.rstrip("/") + "/" self._username = username self._password = password # Track original URL with credentials (set by from_parsedurl when credentials come from URL) self._url_with_auth: str | None = None self.dumb = dumb GitClient.__init__( self, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) def _http_request( self, url: str, headers: dict[str, str] | None = None, data: bytes | Iterator[bytes] | None = None, raise_for_status: bool = True, ) -> tuple["HTTPResponse", Callable[[int], bytes]]: """Perform HTTP request. Args: url: Request URL. headers: Optional custom headers to override defaults. data: Request data. raise_for_status: Whether to raise an exception for HTTP errors. 
Returns: Tuple (response, read), where response is an urllib3 response object with additional content_type and redirect_location properties, and read is a consumable read method for the response data. Raises: GitProtocolError """ raise NotImplementedError(self._http_request) def _discover_references( self, service: bytes, base_url: str, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> tuple[ dict[Ref, ObjectID | None], set[bytes], str, dict[Ref, Ref], dict[Ref, ObjectID], ]: if ( protocol_version is not None and protocol_version not in GIT_PROTOCOL_VERSIONS ): raise ValueError(f"unknown Git protocol version {protocol_version}") assert base_url[-1] == "/" tail = "info/refs" headers = {"Accept": "*/*"} if self.dumb is not True: tail += "?service={}".format(service.decode("ascii")) # Enable protocol v2 only when fetching, not when pushing. # Git does not yet implement push over protocol v2, and as of # git version 2.37.3 git-http-backend's behaviour is erratic if # we try: It responds with a Git-protocol-v1-style ref listing # which lacks the "001f# service=git-receive-pack" marker. if service == b"git-upload-pack": if protocol_version is None: self.protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_FETCH else: self.protocol_version = protocol_version if self.protocol_version == 2: headers["Git-Protocol"] = "version=2" else: self.protocol_version = DEFAULT_GIT_PROTOCOL_VERSION_SEND url = urljoin(base_url, tail) resp, read = self._http_request(url, headers) if resp.redirect_location: # Something changed (redirect!), so let's update the base URL if not resp.redirect_location.endswith(tail): raise GitProtocolError( f"Redirected from URL {url} to URL {resp.redirect_location} without {tail}" ) base_url = urljoin(url, resp.redirect_location[: -len(tail)]) try: self.dumb = resp.content_type is None or not resp.content_type.startswith( "application/x-git-" ) if not self.dumb: def begin_protocol_v2( proto: Protocol, ) -> tuple[set[bytes], Any, Callable[[int], bytes], Protocol]: nonlocal ref_prefix server_capabilities = read_server_capabilities(proto.read_pkt_seq()) if ref_prefix is None: ref_prefix = DEFAULT_REF_PREFIX pkts = [ b"symrefs", b"peel", ] for prefix in ref_prefix: pkts.append(b"ref-prefix " + prefix) body = b"".join( [pkt_line(b"command=ls-refs\n"), b"0001", pkt_seq(*pkts)] ) resp, read = self._smart_request( service.decode("ascii"), base_url, body ) proto = Protocol(read, lambda data: None) return server_capabilities, resp, read, proto proto = Protocol(read, lambda data: None) server_protocol_version = negotiate_protocol_version(proto) if server_protocol_version not in GIT_PROTOCOL_VERSIONS: raise ValueError( f"unknown Git protocol version {server_protocol_version} used by server" ) if protocol_version and server_protocol_version > protocol_version: raise ValueError( f"bad Git protocol version {server_protocol_version} used by server" ) self.protocol_version = server_protocol_version if self.protocol_version == 2: server_capabilities, resp, read, proto = begin_protocol_v2(proto) (refs, symrefs, peeled) = read_pkt_refs_v2(proto.read_pkt_seq()) return refs, server_capabilities, base_url, symrefs, peeled else: try: [pkt] = list(proto.read_pkt_seq()) except ValueError as exc: raise GitProtocolError( "unexpected number of packets received" ) from exc if pkt.rstrip(b"\n") != (b"# service=" + service): raise GitProtocolError( f"unexpected first line {pkt!r} from smart server" ) # Github sends "version 2" after sending the service name. 
# Try to negotiate protocol version 2 again. server_protocol_version = negotiate_protocol_version(proto) if server_protocol_version not in GIT_PROTOCOL_VERSIONS: raise ValueError( f"unknown Git protocol version {server_protocol_version} used by server" ) if protocol_version and server_protocol_version > protocol_version: raise ValueError( f"bad Git protocol version {server_protocol_version} used by server" ) self.protocol_version = server_protocol_version if self.protocol_version == 2: server_capabilities, resp, read, proto = begin_protocol_v2( proto ) (refs, symrefs, peeled) = read_pkt_refs_v2(proto.read_pkt_seq()) else: ( refs_v1, server_capabilities, ) = read_pkt_refs_v1(proto.read_pkt_seq()) # Convert v1 refs to Optional type refs = _to_optional_dict(refs_v1) # TODO: split_peeled_refs should accept Optional values (refs, peeled) = split_peeled_refs(refs) # type: ignore[arg-type,assignment] (symrefs, _agent) = _extract_symrefs_and_agent( server_capabilities ) if ref_prefix is not None: refs = filter_ref_prefix(refs, ref_prefix) return refs, server_capabilities, base_url, symrefs, peeled else: self.protocol_version = 0 # dumb servers only support protocol v0 # Read all the response data data = b"" while True: chunk = read(4096) if not chunk: break data += chunk from typing import cast info_refs = read_info_refs(BytesIO(data)) (refs_nonopt, peeled) = split_peeled_refs(info_refs) if ref_prefix is not None: refs_nonopt = filter_ref_prefix(refs_nonopt, ref_prefix) refs_result: dict[Ref, ObjectID | None] = cast( dict[Ref, ObjectID | None], refs_nonopt ) return refs_result, set(), base_url, {}, peeled finally: resp.close() def _smart_request( self, service: str, url: str, data: bytes | Iterator[bytes] ) -> tuple["HTTPResponse", Callable[[int], bytes]]: """Send a 'smart' HTTP request. This is a simple wrapper around _http_request that sets a couple of extra headers. """ assert url[-1] == "/" url = urljoin(url, service) result_content_type = f"application/x-{service}-result" headers = { "Content-Type": f"application/x-{service}-request", "Accept": result_content_type, } if self.protocol_version == 2: headers["Git-Protocol"] = "version=2" if isinstance(data, bytes): headers["Content-Length"] = str(len(data)) resp, read = self._http_request(url, headers, data) if ( not resp.content_type or resp.content_type.split(";")[0] != result_content_type ): raise GitProtocolError( f"Invalid content-type from server: {resp.content_type}" ) return resp, read def send_pack( self, path: str | bytes, update_refs: Callable[[dict[Ref, ObjectID]], dict[Ref, ObjectID]], generate_pack_data: "GeneratePackDataFunc", progress: Callable[[bytes], None] | None = None, ) -> SendPackResult: """Upload a pack to a remote repository. Args: path: Repository path (as bytestring or string) update_refs: Function to determine changes to remote refs. Receives dict with existing remote refs, returns dict with changed refs (name -> sha, where sha=ZERO_SHA for deletions) generate_pack_data: Function that can return a tuple with number of elements and pack data to upload. 
progress: Optional progress function Returns: SendPackResult Raises: SendPackError: if server rejects the pack data """ url = self._get_url(path) old_refs, server_capabilities, url, _symrefs, _peeled = ( self._discover_references(b"git-receive-pack", url) ) ( negotiated_capabilities, agent, ) = self._negotiate_receive_pack_capabilities(server_capabilities) negotiated_capabilities.add(capability_agent()) if CAPABILITY_REPORT_STATUS in negotiated_capabilities: self._report_status_parser = ReportStatusParser() # Assert that old_refs has no None values assert all(v is not None for v in old_refs.values()), ( "old_refs should not contain None values" ) old_refs_typed: dict[Ref, ObjectID] = old_refs # type: ignore[assignment] new_refs = update_refs(dict(old_refs_typed)) if new_refs is None: # Determine wants function is aborting the push. # Convert to Optional type for SendPackResult return SendPackResult( _to_optional_dict(old_refs_typed), agent=agent, ref_status={} ) if set(new_refs.items()).issubset(set(old_refs_typed.items())): # Convert to Optional type for SendPackResult return SendPackResult( _to_optional_dict(new_refs), agent=agent, ref_status={} ) if self.dumb: raise NotImplementedError(self.fetch_pack) def body_generator() -> Iterator[bytes]: header_handler = _v1ReceivePackHeader( list(negotiated_capabilities), old_refs_typed, new_refs ) for pkt in header_handler: yield pkt_line(pkt) pack_data_count, pack_data = generate_pack_data( header_handler.have, header_handler.want, ofs_delta=(CAPABILITY_OFS_DELTA in negotiated_capabilities), progress=progress, ) if self._should_send_pack(new_refs): yield from PackChunkGenerator( # TODO: Don't hardcode object format num_records=pack_data_count, records=pack_data, object_format=DEFAULT_OBJECT_FORMAT, ) resp, read = self._smart_request("git-receive-pack", url, data=body_generator()) try: resp_proto = Protocol(read, lambda data: None) ref_status = self._handle_receive_pack_tail( resp_proto, negotiated_capabilities, progress ) # Convert to Optional type for SendPackResult return SendPackResult( _to_optional_dict(new_refs), agent=agent, ref_status=ref_status ) finally: resp.close() def fetch_pack( self, path: str | bytes, determine_wants: "DetermineWantsFunc", graph_walker: GraphWalker, pack_data: Callable[[bytes], int], progress: Callable[[bytes], None] | None = None, depth: int | None = None, ref_prefix: Sequence[bytes] | None = None, filter_spec: bytes | None = None, protocol_version: int | None = None, shallow_since: str | None = None, shallow_exclude: list[str] | None = None, ) -> FetchPackResult: """Retrieve a pack from a git smart server. Args: path: Path to fetch from determine_wants: Callback that returns list of commits to fetch graph_walker: Object with next() and ack(). pack_data: Callback called for each bit of data in the pack progress: Callback for progress reports (strings) depth: Depth for request ref_prefix: List of prefixes of desired references, as a list of bytestrings. Filtering is done by the server if supported, and client side otherwise. filter_spec: A git-rev-list-style object filter spec, as bytestring. Only used if the server supports the Git protocol-v2 'filter' feature, and ignored otherwise. protocol_version: Desired Git protocol version. By default the highest mutually supported protocol version will be used. 
shallow_since: Deepen the history to include commits after this date shallow_exclude: Deepen the history to exclude commits reachable from these refs Returns: FetchPackResult object """ url = self._get_url(path) refs, server_capabilities, url, symrefs, _peeled = self._discover_references( b"git-upload-pack", url, protocol_version=protocol_version, ref_prefix=ref_prefix, ) ( negotiated_capabilities, capa_symrefs, agent, ) = self._negotiate_upload_pack_capabilities(server_capabilities) object_format = extract_object_format_from_capabilities(server_capabilities) if not symrefs and capa_symrefs: symrefs = capa_symrefs # Filter out None values from refs for determine_wants refs_filtered = {k: v for k, v in refs.items() if v is not None} if depth is not None: wants = determine_wants(refs_filtered, depth=depth) else: wants = determine_wants(refs_filtered) if wants is not None: wants = [cid for cid in wants if cid != ZERO_SHA] if not wants and not self.dumb: return FetchPackResult(refs, symrefs, agent, object_format=object_format) elif self.dumb: # Use dumb HTTP protocol from .dumb import DumbRemoteHTTPRepo # Pass http_request function dumb_repo = DumbRemoteHTTPRepo( url, functools.partial(self._http_request, raise_for_status=False) ) # Fetch pack data from dumb remote pack_data_list = list( dumb_repo.fetch_pack_data( lambda refs, depth: wants, graph_walker, progress=progress, depth=depth, ) ) head = dumb_repo.get_head() if head is not None: symrefs[HEADREF] = head # Write pack data if pack_data_list: from .pack import write_pack_data # Wrap pack_data to match expected signature def write_fn(data: bytes) -> None: pack_data(data) # Write pack data directly using the unpacked objects write_pack_data( write_fn, iter(pack_data_list), num_records=len(pack_data_list), progress=progress, object_format=DEFAULT_OBJECT_FORMAT, ) return FetchPackResult(refs, symrefs, agent, object_format=object_format) req_data = BytesIO() req_proto = Protocol(None, req_data.write) # type: ignore (new_shallow, new_unshallow) = _handle_upload_pack_head( req_proto, negotiated_capabilities, graph_walker, wants, can_read=None, depth=depth, protocol_version=self.protocol_version, shallow_since=shallow_since, shallow_exclude=shallow_exclude, ) if self.protocol_version == 2: data = pkt_line(b"command=fetch\n") + b"0001" if CAPABILITY_THIN_PACK in self._fetch_capabilities: data += pkt_line(b"thin-pack\n") if ( find_capability( negotiated_capabilities, CAPABILITY_FETCH, CAPABILITY_FILTER ) and filter_spec ): data += pkt_line(b"filter %s\n" % filter_spec) elif filter_spec: self._warn_filter_objects() data += req_data.getvalue() else: if filter_spec: self._warn_filter_objects() data = req_data.getvalue() resp, read = self._smart_request("git-upload-pack", url, data) try: resp_proto = Protocol(read, None) # type: ignore if new_shallow is None and new_unshallow is None: (new_shallow, new_unshallow) = _read_shallow_updates( resp_proto.read_pkt_seq() ) _handle_upload_pack_tail( resp_proto, negotiated_capabilities, graph_walker, pack_data, progress, protocol_version=self.protocol_version, ) return FetchPackResult( refs, symrefs, agent, new_shallow, new_unshallow, object_format ) finally: resp.close() def get_refs( self, path: str | bytes, protocol_version: int | None = None, ref_prefix: Sequence[bytes] | None = None, ) -> LsRemoteResult: """Retrieve the current refs from a git smart server.""" url = self._get_url(path) refs, server_capabilities, _, symrefs, peeled = self._discover_references( b"git-upload-pack", url, 
protocol_version=protocol_version, ref_prefix=ref_prefix, ) object_format = extract_object_format_from_capabilities(server_capabilities) for refname, refvalue in peeled.items(): refs[Ref(refname + PEELED_TAG_SUFFIX)] = refvalue return LsRemoteResult(refs, symrefs, object_format=object_format) def get_url(self, path: str) -> str: """Get the HTTP URL for a path.""" url = self._get_url(path).rstrip("/") # Include credentials in the URL only if they came from a URL (not passed explicitly) # This preserves credentials that were in the original URL for git config storage if self._url_with_auth is not None: from urllib.parse import quote, urlparse, urlunparse assert self._username is not None parsed = urlparse(url) # Construct netloc with credentials if self._password is not None: netloc = f"{quote(self._username, safe='')}:{quote(self._password, safe='')}@{parsed.hostname}" else: netloc = f"{quote(self._username, safe='')}@{parsed.hostname}" if parsed.port: netloc += f":{parsed.port}" # Reconstruct URL with credentials url = urlunparse( ( parsed.scheme, netloc, parsed.path, parsed.params, parsed.query, parsed.fragment, ) ) return url def _get_url(self, path: str | bytes) -> str: path_str = path if isinstance(path, str) else path.decode("utf-8") return urljoin(self._base_url, path_str).rstrip("/") + "/" @classmethod def from_parsedurl( cls, parsedurl: ParseResult, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, dumb: bool = False, username: str | None = None, password: str | None = None, config: Config | None = None, pool_manager: "urllib3.PoolManager | None" = None, ) -> "AbstractHttpGitClient": """Create an AbstractHttpGitClient from a parsed URL. Args: parsedurl: Result of urlparse() thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress progress output include_tags: Whether to include tags dumb: Whether to use dumb HTTP transport username: Optional username for authentication password: Optional password for authentication config: Configuration object pool_manager: Optional urllib3 PoolManager for HTTP(S) connections Returns: An AbstractHttpGitClient instance """ # Extract credentials from URL if present # ParseResult.username and .password are URL-encoded, need to unquote them from urllib.parse import unquote url_username = unquote(parsedurl.username) if parsedurl.username else None url_password = unquote(parsedurl.password) if parsedurl.password else None # Explicit parameters take precedence over URL credentials final_username = username if username is not None else url_username final_password = password if password is not None else url_password # Remove credentials from URL for base_url hostname = parsedurl.hostname or "" base_parsed = parsedurl._replace(netloc=hostname) if parsedurl.port: base_parsed = base_parsed._replace(netloc=f"{hostname}:{parsedurl.port}") # Pass credentials to constructor if it's a subclass that supports them if issubclass(cls, Urllib3HttpGitClient): client: AbstractHttpGitClient = cls( urlunparse(base_parsed), dumb=dumb, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, username=final_username, password=final_password, config=config, pool_manager=pool_manager, ) else: # Base class now supports credentials in constructor client = cls( urlunparse(base_parsed), dumb=dumb, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, 
include_tags=include_tags, username=final_username, password=final_password, ) # Mark that credentials came from URL (not passed explicitly) if URL had credentials if url_username is not None or url_password is not None: client._url_with_auth = urlunparse(parsedurl) return client def __repr__(self) -> str: """Return string representation of this client.""" return f"{type(self).__name__}({self._base_url!r}, dumb={self.dumb!r})" def _wrap_urllib3_exceptions( func: Callable[..., bytes], ) -> Callable[..., bytes]: from urllib3.exceptions import ProtocolError def wrapper(*args: object, **kwargs: object) -> bytes: try: return func(*args, **kwargs) except ProtocolError as error: raise GitProtocolError(str(error)) from error return wrapper class Urllib3HttpGitClient(AbstractHttpGitClient): """HTTP Git client using urllib3. Supports callback-based authentication for both HTTP and proxy authentication, allowing dynamic credential handling without intercepting exceptions. Example: >>> def auth_callback(url, www_authenticate, attempt): ... # Parse www_authenticate header to determine auth scheme ... # Return credentials or None to cancel ... return {"username": "user", "password": "pass"} >>> >>> client = Urllib3HttpGitClient( ... "https://github.com/private/repo.git", ... auth_callback=auth_callback ... ) """ pool_manager: "urllib3.PoolManager | urllib3.ProxyManager | AuthCallbackPoolManager" def __init__( self, base_url: str, dumb: bool | None = None, pool_manager: "urllib3.PoolManager | urllib3.ProxyManager | AuthCallbackPoolManager | None" = None, config: Config | None = None, username: str | None = None, password: str | None = None, timeout: float | None = None, extra_headers: dict[str, str] | None = None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, proxy_auth_callback: Callable[[str, str, int], dict[str, str] | None] | None = None, ) -> None: """Initialize Urllib3HttpGitClient.""" self._timeout = timeout self._extra_headers = extra_headers or {} self._auth_callback = auth_callback self._proxy_auth_callback = proxy_auth_callback if pool_manager is None: self.pool_manager = default_urllib3_manager( config, base_url=base_url, timeout=timeout, auth_callback=auth_callback, proxy_auth_callback=proxy_auth_callback, ) else: # Use provided pool manager as-is # If you want callbacks with a custom pool manager, wrap it yourself self.pool_manager = pool_manager if username is not None: # No escaping needed: ":" is not allowed in username: # https://tools.ietf.org/html/rfc2617#section-2 credentials = f"{username}:{password or ''}" import urllib3.util basic_auth = urllib3.util.make_headers(basic_auth=credentials) self.pool_manager.headers.update(basic_auth) # type: ignore self.config = config super().__init__( base_url=base_url, dumb=dumb if dumb is not None else False, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, username=username, password=password, ) def _get_url(self, path: str | bytes) -> str: if not isinstance(path, str): # urllib3.util.url._encode_invalid_chars() converts the path back # to bytes using the utf-8 codec. 
path = path.decode("utf-8") return urljoin(self._base_url, path).rstrip("/") + "/" def _http_request( self, url: str, headers: dict[str, str] | None = None, data: bytes | Iterator[bytes] | None = None, raise_for_status: bool = True, ) -> tuple["HTTPResponse", Callable[[int], bytes]]: import urllib3.exceptions req_headers = dict(self.pool_manager.headers) if headers is not None: req_headers.update(headers) req_headers["Pragma"] = "no-cache" try: request_kwargs = { "headers": req_headers, "preload_content": False, } if self._timeout is not None: request_kwargs["timeout"] = self._timeout if data is None: resp = self.pool_manager.request("GET", url, **request_kwargs) # type: ignore[arg-type] else: request_kwargs["body"] = data resp = self.pool_manager.request("POST", url, **request_kwargs) # type: ignore[arg-type] except urllib3.exceptions.HTTPError as e: raise GitProtocolError(str(e)) from e if raise_for_status: if resp.status == 404: raise NotGitRepository if resp.status == 401: raise HTTPUnauthorized(resp.headers.get("WWW-Authenticate"), url) if resp.status == 407: raise HTTPProxyUnauthorized(resp.headers.get("Proxy-Authenticate"), url) if resp.status != 200: raise GitProtocolError(f"unexpected http resp {resp.status} for {url}") resp.content_type = resp.headers.get("Content-Type") # type: ignore[union-attr] resp_url = resp.geturl() resp.redirect_location = resp_url if resp_url != url else "" # type: ignore[union-attr] return resp, _wrap_urllib3_exceptions(resp.read) # type: ignore[return-value] HttpGitClient = Urllib3HttpGitClient def _win32_url_to_path(parsed: ParseResult) -> str: """Convert a file: URL to a path. https://datatracker.ietf.org/doc/html/rfc8089 """ assert parsed.scheme == "file" _, netloc, path, _, _, _ = parsed if netloc == "localhost" or not netloc: netloc = "" elif ( netloc and len(netloc) >= 2 and netloc[0].isalpha() and netloc[1:2] in (":", ":/") ): # file://C:/foo.bar/baz or file://C://foo.bar//baz netloc = netloc[:2] else: raise NotImplementedError("Non-local file URLs are not supported") from nturl2path import url2pathname return url2pathname(netloc + path) def get_transport_and_path_from_url( url: str, config: Config | None = None, operation: str | None = None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, username: str | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, pool_manager: "urllib3.PoolManager | None" = None, ) -> tuple[GitClient, str]: """Obtain a git client from a URL. Args: url: URL to open (a unicode string) config: Optional config object operation: Kind of operation that'll be performed; "pull" or "push" thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress output include_tags: Send annotated tags when sending the objects they point to username: Optional username for authentication password: Optional password for authentication key_filename: Optional SSH key file ssh_command: Optional custom SSH command pool_manager: Optional urllib3 PoolManager for HTTP(S) connections Returns: Tuple with client instance and relative path. 
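    Example (illustrative; the URL is hypothetical):

        >>> client, path = get_transport_and_path_from_url(
        ...     "https://example.com/project.git")
        >>> path
        '/project.git'

    Here the returned client is an ``HttpGitClient``; ``ssh://`` URLs map to
    ``SSHGitClient``, ``git://`` URLs to ``TCPGitClient`` and ``file://`` URLs
    to the default local client.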
""" if config is not None: url = apply_instead_of(config, url, push=(operation == "push")) return _get_transport_and_path_from_url( url, config=config, operation=operation, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, username=username, password=password, key_filename=key_filename, ssh_command=ssh_command, pool_manager=pool_manager, ) def _get_transport_and_path_from_url( url: str, config: Config | None, operation: str | None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, username: str | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, pool_manager: "urllib3.PoolManager | None" = None, ) -> tuple[GitClient, str]: parsed = urlparse(url) if parsed.scheme == "git": return ( TCPGitClient.from_parsedurl( parsed, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), parsed.path, ) elif parsed.scheme in ("git+ssh", "ssh"): return SSHGitClient.from_parsedurl( parsed, config=config, username=username, password=password, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, key_filename=key_filename, ssh_command=ssh_command, ), parsed.path elif parsed.scheme in ("http", "https"): return ( HttpGitClient.from_parsedurl( parsed, config=config, username=username, password=password, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, pool_manager=pool_manager, ), parsed.path, ) elif parsed.scheme == "file": if sys.platform == "win32" or os.name == "nt": return default_local_git_client_cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), _win32_url_to_path(parsed) return ( default_local_git_client_cls.from_parsedurl( parsed, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), parsed.path, ) raise ValueError(f"unknown scheme '{parsed.scheme}'") def parse_rsync_url(location: str) -> tuple[str | None, str, str]: """Parse a rsync-style URL.""" if ":" in location and "@" not in location: # SSH with no user@, zero or one leading slash. (host, path) = location.split(":", 1) user = None elif ":" in location: # SSH with user@host:foo. user_host, path = location.split(":", 1) if "@" in user_host: user, host = user_host.rsplit("@", 1) else: user = None host = user_host else: raise ValueError("not a valid rsync-style URL") return (user, host, path) def get_transport_and_path( location: str, config: Config | None = None, operation: str | None = None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, username: str | None = None, password: str | None = None, key_filename: str | None = None, ssh_command: str | None = None, pool_manager: "urllib3.PoolManager | None" = None, ) -> tuple[GitClient, str]: """Obtain a git client from a URL. 
Args: location: URL or path (a string) config: Optional config object operation: Kind of operation that'll be performed; "pull" or "push" thin_packs: Whether or not thin packs should be retrieved report_activity: Optional callback for reporting transport activity quiet: Whether to suppress output include_tags: Send annotated tags when sending the objects they point to username: Optional username for authentication password: Optional password for authentication key_filename: Optional SSH key file ssh_command: Optional custom SSH command pool_manager: Optional urllib3 PoolManager for HTTP(S) connections Returns: Tuple with client instance and relative path. """ if config is not None: location = apply_instead_of(config, location, push=(operation == "push")) # First, try to parse it as a URL try: return _get_transport_and_path_from_url( location, config=config, operation=operation, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, username=username, password=password, key_filename=key_filename, ssh_command=ssh_command, pool_manager=pool_manager, ) except ValueError: pass if sys.platform == "win32" and location[0].isalpha() and location[1:3] == ":\\": # Windows local path - but check if it's a bundle file first if BundleClient._is_bundle_file(location): return BundleClient( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), location return default_local_git_client_cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), location try: (rsync_username, hostname, path) = parse_rsync_url(location) except ValueError: # Check if it's a bundle file before assuming it's a local path if BundleClient._is_bundle_file(location): return BundleClient( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), location # Otherwise, assume it's a local path. return default_local_git_client_cls( thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), location else: return SSHGitClient( hostname, username=rsync_username or username, config=config, password=password, key_filename=key_filename, ssh_command=ssh_command, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ), path DEFAULT_GIT_CREDENTIALS_PATHS = [ os.path.expanduser("~/.git-credentials"), get_xdg_config_home_path("git", "credentials"), ] def get_credentials_from_store( scheme: str, hostname: str, username: str | None = None, fnames: list[str] = DEFAULT_GIT_CREDENTIALS_PATHS, ) -> Iterator[tuple[str, str]]: """Read credentials from a Git credential store.""" for fname in fnames: try: with open(fname, "rb") as f: for line in f: line_str = line.strip().decode("utf-8") parsed_line = urlparse(line_str) if ( parsed_line.scheme == scheme and parsed_line.hostname == hostname and (username is None or parsed_line.username == username) ): if parsed_line.username and parsed_line.password: yield parsed_line.username, parsed_line.password except FileNotFoundError: # If the file doesn't exist, try the next one. 
continue dulwich-1.0.0/dulwich/cloud/000077500000000000000000000000001513301442600157135ustar00rootroot00000000000000dulwich-1.0.0/dulwich/cloud/__init__.py000066400000000000000000000026261513301442600200320ustar00rootroot00000000000000# __init__.py -- Cloud storage backends for dulwich # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Cloud storage backends for dulwich. This package provides support for storing Git repositories in various cloud storage services. It includes implementations for different cloud providers that can be used as alternative storage backends to the traditional filesystem-based storage. Available backends: - GCS (Google Cloud Storage): Store Git objects in Google Cloud Storage buckets """ __all__ = [] dulwich-1.0.0/dulwich/cloud/gcs.py000066400000000000000000000076551513301442600170560ustar00rootroot00000000000000# object_store.py -- Object store for git objects # Copyright (C) 2021 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Storage of repositories on GCS.""" __all__ = [ "GcsObjectStore", ] import posixpath import tempfile from collections.abc import Iterator from typing import TYPE_CHECKING, BinaryIO from ..object_store import BucketBasedObjectStore from ..pack import ( PACK_SPOOL_FILE_MAX_SIZE, Pack, PackData, PackIndex, load_pack_index_file, ) if TYPE_CHECKING: from google.cloud.storage import Bucket # TODO(jelmer): For performance, read ranges? class GcsObjectStore(BucketBasedObjectStore): """Object store implementation using Google Cloud Storage.""" def __init__(self, bucket: "Bucket", subpath: str = "") -> None: """Initialize GCS object store. 
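        Example (a minimal sketch; bucket and prefix names are placeholders,
        and Google Cloud credentials are required):

            >>> from google.cloud import storage  # doctest: +SKIP
            >>> bucket = storage.Client().bucket("my-git-objects")  # doctest: +SKIP
            >>> store = GcsObjectStore(bucket, subpath="repos/example")  # doctest: +SKIP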
Args: bucket: GCS bucket instance subpath: Optional subpath within the bucket """ super().__init__() self.bucket = bucket self.subpath = subpath def __repr__(self) -> str: """Return string representation of GcsObjectStore.""" return f"{type(self).__name__}({self.bucket!r}, subpath={self.subpath!r})" def _remove_pack_by_name(self, name: str) -> None: self.bucket.delete_blobs( [posixpath.join(self.subpath, name) + "." + ext for ext in ["pack", "idx"]] ) def _iter_pack_names(self) -> Iterator[str]: packs: dict[str, set[str]] = {} for blob in self.bucket.list_blobs(prefix=self.subpath): name, ext = posixpath.splitext(posixpath.basename(blob.name)) packs.setdefault(name, set()).add(ext) for name, exts in packs.items(): if exts == {".pack", ".idx"}: yield name def _load_pack_data(self, name: str) -> PackData: b = self.bucket.blob(posixpath.join(self.subpath, name + ".pack")) from typing import cast from ..file import _GitFile with tempfile.SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE) as f: b.download_to_file(f) f.seek(0) return PackData(name + ".pack", self.object_format, cast(_GitFile, f)) def _load_pack_index(self, name: str) -> PackIndex: b = self.bucket.blob(posixpath.join(self.subpath, name + ".idx")) with tempfile.SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE) as f: b.download_to_file(f) f.seek(0) return load_pack_index_file(name + ".idx", f, self.object_format) def _get_pack(self, name: str) -> Pack: return Pack.from_lazy_objects( lambda: self._load_pack_data(name), lambda: self._load_pack_index(name), ) def _upload_pack( self, basename: str, pack_file: BinaryIO, index_file: BinaryIO ) -> None: idxblob = self.bucket.blob(posixpath.join(self.subpath, basename + ".idx")) datablob = self.bucket.blob(posixpath.join(self.subpath, basename + ".pack")) idxblob.upload_from_file(index_file) datablob.upload_from_file(pack_file) dulwich-1.0.0/dulwich/commit_graph.py000066400000000000000000000577231513301442600176460ustar00rootroot00000000000000# commit_graph.py -- Git commit graph file format support # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git commit graph file format support. Git's commit graph files store commit metadata and generation numbers for faster graph traversal operations like merge-base computation. 
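A typical read-only use is to locate and load a repository's commit-graph file,
then query it (a minimal sketch; the repository path and commit id are
placeholders)::

    from dulwich.commit_graph import find_commit_graph_file, read_commit_graph

    path = find_commit_graph_file("/path/to/repo/.git")
    if path is not None:
        graph = read_commit_graph(path)
        generation = graph.get_generation_number(b"<hex commit id>")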
The commit graph format is documented at: https://git-scm.com/docs/gitformat-commit-graph """ __all__ = [ "CHUNK_BASE_GRAPHS_LIST", "CHUNK_BLOOM_FILTER_DATA", "CHUNK_BLOOM_FILTER_INDEX", "CHUNK_COMMIT_DATA", "CHUNK_EXTRA_EDGE_LIST", "CHUNK_GENERATION_DATA", "CHUNK_GENERATION_DATA_OVERFLOW", "CHUNK_OID_FANOUT", "CHUNK_OID_LOOKUP", "COMMIT_GRAPH_SIGNATURE", "COMMIT_GRAPH_VERSION", "GENERATION_NUMBER_INFINITY", "GENERATION_NUMBER_V1_MAX", "GENERATION_NUMBER_ZERO", "GRAPH_EXTRA_EDGES_NEEDED", "GRAPH_LAST_EDGE", "GRAPH_PARENT_MISSING", "GRAPH_PARENT_NONE", "HASH_VERSION_SHA1", "HASH_VERSION_SHA256", "CommitGraph", "CommitGraphChunk", "CommitGraphEntry", "find_commit_graph_file", "generate_commit_graph", "get_reachable_commits", "read_commit_graph", "write_commit_graph", ] import os import struct from collections.abc import Iterator, Sequence from typing import TYPE_CHECKING, BinaryIO from .file import _GitFile from .object_format import ObjectFormat if TYPE_CHECKING: from .object_store import BaseObjectStore from .objects import Commit, ObjectID, RawObjectID, hex_to_sha, sha_to_hex # File format constants COMMIT_GRAPH_SIGNATURE = b"CGPH" COMMIT_GRAPH_VERSION = 1 HASH_VERSION_SHA1 = 1 HASH_VERSION_SHA256 = 2 # Chunk IDs CHUNK_OID_FANOUT = b"OIDF" CHUNK_OID_LOOKUP = b"OIDL" CHUNK_COMMIT_DATA = b"CDAT" CHUNK_GENERATION_DATA = b"GDA2" CHUNK_GENERATION_DATA_OVERFLOW = b"GDO2" CHUNK_EXTRA_EDGE_LIST = b"EDGE" CHUNK_BLOOM_FILTER_INDEX = b"BIDX" CHUNK_BLOOM_FILTER_DATA = b"BDAT" CHUNK_BASE_GRAPHS_LIST = b"BASE" # Generation number constants GENERATION_NUMBER_INFINITY = 0xFFFFFFFF GENERATION_NUMBER_ZERO = 0 GENERATION_NUMBER_V1_MAX = 0x3FFFFFFF # Parent encoding constants GRAPH_PARENT_MISSING = 0x70000000 GRAPH_PARENT_NONE = 0x70000000 GRAPH_EXTRA_EDGES_NEEDED = 0x80000000 GRAPH_LAST_EDGE = 0x80000000 class CommitGraphEntry: """Represents a single commit entry in the commit graph.""" def __init__( self, commit_id: ObjectID, tree_id: ObjectID, parents: list[ObjectID], generation: int, commit_time: int, ) -> None: """Initialize CommitGraphEntry. Args: commit_id: The commit object ID tree_id: The tree object ID parents: List of parent commit IDs generation: Generation number commit_time: Commit timestamp """ self.commit_id = commit_id self.tree_id = tree_id self.parents = parents self.generation = generation self.commit_time = commit_time def __repr__(self) -> str: """Return string representation of CommitGraphEntry.""" return ( f"CommitGraphEntry(commit_id={self.commit_id!r}, " f"tree_id={self.tree_id!r}, parents={self.parents!r}, " f"generation={self.generation}, commit_time={self.commit_time})" ) class CommitGraphChunk: """Represents a chunk in the commit graph file.""" def __init__(self, chunk_id: bytes, data: bytes) -> None: """Initialize CommitGraphChunk. Args: chunk_id: Chunk identifier data: Chunk data """ self.chunk_id = chunk_id self.data = data def __repr__(self) -> str: """Return string representation of CommitGraphChunk.""" return f"CommitGraphChunk(chunk_id={self.chunk_id!r}, size={len(self.data)})" class CommitGraph: """Git commit graph file reader/writer.""" def __init__(self, *, object_format: ObjectFormat | None = None) -> None: """Initialize CommitGraph. 
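        Example (a minimal sketch):

            >>> from dulwich.object_format import SHA1
            >>> graph = CommitGraph(object_format=SHA1)
            >>> len(graph)
            0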
Args: object_format: Object format to use (defaults to SHA1) """ import warnings from .object_format import DEFAULT_OBJECT_FORMAT, SHA256 if object_format is None: warnings.warn( "CommitGraph() should be called with object_format parameter", DeprecationWarning, stacklevel=2, ) object_format = DEFAULT_OBJECT_FORMAT self.object_format = object_format self.hash_version = ( HASH_VERSION_SHA256 if object_format == SHA256 else HASH_VERSION_SHA1 ) self.chunks: dict[bytes, CommitGraphChunk] = {} self.entries: list[CommitGraphEntry] = [] self._oid_to_index: dict[RawObjectID, int] = {} @classmethod def from_file(cls, f: BinaryIO) -> "CommitGraph": """Read commit graph from file.""" return cls._read_from_file(f) @classmethod def _read_from_file(cls, f: BinaryIO) -> "CommitGraph": """Read commit graph data from file.""" # Read header signature = f.read(4) if signature != COMMIT_GRAPH_SIGNATURE: raise ValueError(f"Invalid commit graph signature: {signature!r}") version = struct.unpack(">B", f.read(1))[0] if version != COMMIT_GRAPH_VERSION: raise ValueError(f"Unsupported commit graph version: {version}") hash_version = struct.unpack(">B", f.read(1))[0] # Set object_format based on hash_version from file from .object_format import SHA1, SHA256 if hash_version == HASH_VERSION_SHA1: object_format = SHA1 elif hash_version == HASH_VERSION_SHA256: object_format = SHA256 else: raise ValueError(f"Unsupported hash version: {hash_version}") # Create instance with correct object_format graph = cls(object_format=object_format) graph.hash_version = hash_version num_chunks = struct.unpack(">B", f.read(1))[0] struct.unpack(">B", f.read(1))[0] # Read table of contents toc_entries = [] for _ in range(num_chunks + 1): # +1 for terminating entry chunk_id = f.read(4) offset = struct.unpack(">Q", f.read(8))[0] toc_entries.append((chunk_id, offset)) # Read chunks # Offsets in TOC are absolute from start of file for i in range(num_chunks): chunk_id, offset = toc_entries[i] next_offset = toc_entries[i + 1][1] chunk_size = next_offset - offset f.seek(offset) chunk_data = f.read(chunk_size) graph.chunks[chunk_id] = CommitGraphChunk(chunk_id, chunk_data) # Parse chunks graph._parse_chunks() return graph def _parse_chunks(self) -> None: """Parse chunk data into entries.""" if CHUNK_OID_LOOKUP not in self.chunks: raise ValueError("Missing required OID lookup chunk") if CHUNK_COMMIT_DATA not in self.chunks: raise ValueError("Missing required commit data chunk") # Parse OID lookup chunk oid_lookup_data = self.chunks[CHUNK_OID_LOOKUP].data num_commits = len(oid_lookup_data) // self.object_format.oid_length oids = [] for i in range(num_commits): start = i * self.object_format.oid_length end = start + self.object_format.oid_length oid = RawObjectID(oid_lookup_data[start:end]) oids.append(oid) self._oid_to_index[oid] = i # Parse commit data chunk commit_data = self.chunks[CHUNK_COMMIT_DATA].data expected_size = num_commits * (self.object_format.oid_length + 16) if len(commit_data) != expected_size: raise ValueError( f"Invalid commit data chunk size: {len(commit_data)}, expected {expected_size}" ) self.entries = [] for i in range(num_commits): offset = i * (self.object_format.oid_length + 16) # Tree OID tree_id = commit_data[offset : offset + self.object_format.oid_length] offset += self.object_format.oid_length # Parent positions (2 x 4 bytes) parent1_pos, parent2_pos = struct.unpack( ">LL", commit_data[offset : offset + 8] ) offset += 8 # Generation number and commit time (2 x 4 bytes) gen_and_time = struct.unpack(">LL", 
commit_data[offset : offset + 8]) generation = gen_and_time[0] >> 2 # Upper 30 bits commit_time = gen_and_time[1] | ( (gen_and_time[0] & 0x3) << 32 ) # 34 bits total # Parse parents parents = [] if parent1_pos < GRAPH_PARENT_MISSING: if parent1_pos >= len(oids): raise ValueError(f"Invalid parent1 position: {parent1_pos}") parents.append(oids[parent1_pos]) if parent2_pos < GRAPH_PARENT_MISSING: if parent2_pos >= len(oids): raise ValueError(f"Invalid parent2 position: {parent2_pos}") parents.append(oids[parent2_pos]) elif parent2_pos >= GRAPH_EXTRA_EDGES_NEEDED: # Handle extra edges (3+ parents) edge_offset = parent2_pos & ~GRAPH_EXTRA_EDGES_NEEDED parents.extend(self._parse_extra_edges(edge_offset, oids)) entry = CommitGraphEntry( commit_id=sha_to_hex(oids[i]), tree_id=sha_to_hex(RawObjectID(tree_id)), parents=[sha_to_hex(p) for p in parents], generation=generation, commit_time=commit_time, ) self.entries.append(entry) def _parse_extra_edges( self, offset: int, oids: Sequence[RawObjectID] ) -> list[RawObjectID]: """Parse extra parent edges for commits with 3+ parents.""" if CHUNK_EXTRA_EDGE_LIST not in self.chunks: return [] edge_data = self.chunks[CHUNK_EXTRA_EDGE_LIST].data parents = [] while offset + 4 <= len(edge_data): parent_pos = struct.unpack(">L", edge_data[offset : offset + 4])[0] offset += 4 if parent_pos & GRAPH_LAST_EDGE: parent_pos &= ~GRAPH_LAST_EDGE if parent_pos < len(oids): parents.append(oids[parent_pos]) break else: if parent_pos < len(oids): parents.append(oids[parent_pos]) return parents def get_entry_by_oid(self, oid: ObjectID) -> CommitGraphEntry | None: """Get commit graph entry by commit OID.""" # Convert hex ObjectID to binary if needed for lookup if isinstance(oid, bytes) and len(oid) == self.object_format.hex_length: # Input is hex ObjectID, convert to binary for internal lookup lookup_oid: RawObjectID = hex_to_sha(oid) else: # Input is already binary lookup_oid = RawObjectID(oid) index = self._oid_to_index.get(lookup_oid) if index is not None: return self.entries[index] return None def get_generation_number(self, oid: ObjectID) -> int | None: """Get generation number for a commit.""" entry = self.get_entry_by_oid(oid) return entry.generation if entry else None def get_parents(self, oid: ObjectID) -> list[ObjectID] | None: """Get parent commit IDs for a commit.""" entry = self.get_entry_by_oid(oid) return entry.parents if entry else None def write_to_file(self, f: BinaryIO | _GitFile) -> None: """Write commit graph to file.""" if not self.entries: raise ValueError("Cannot write empty commit graph") # Sort entries by commit ID for consistent output sorted_entries = sorted(self.entries, key=lambda e: e.commit_id) # Build OID lookup chunk oid_lookup_data = b"" for entry in sorted_entries: oid_lookup_data += hex_to_sha(entry.commit_id) # Build commit data chunk commit_data = b"" # Create OID to index mapping for parent lookups oid_to_index = {entry.commit_id: i for i, entry in enumerate(sorted_entries)} for entry in sorted_entries: # Tree OID (20 bytes) commit_data += hex_to_sha(entry.tree_id) # Parent positions (2 x 4 bytes) if len(entry.parents) == 0: parent1_pos = GRAPH_PARENT_MISSING parent2_pos = GRAPH_PARENT_MISSING elif len(entry.parents) == 1: parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = GRAPH_PARENT_MISSING elif len(entry.parents) == 2: parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING) else: # More than 2 parents - would need 
extra edge list chunk # For now, just store first two parents parent1_pos = oid_to_index.get(entry.parents[0], GRAPH_PARENT_MISSING) parent2_pos = oid_to_index.get(entry.parents[1], GRAPH_PARENT_MISSING) commit_data += struct.pack(">LL", parent1_pos, parent2_pos) # Generation and commit time (2 x 4 bytes) gen_and_time = (entry.generation << 2) | (entry.commit_time >> 32) commit_time_lower = entry.commit_time & 0xFFFFFFFF commit_data += struct.pack(">LL", gen_and_time, commit_time_lower) # Build fanout table fanout_data = b"" fanout_counts = [0] * 256 for i, entry in enumerate(sorted_entries): commit_oid_bytes = hex_to_sha(entry.commit_id) fanout_counts[commit_oid_bytes[0]] = i + 1 # Fill in gaps - each fanout entry should be cumulative for i in range(1, 256): if fanout_counts[i] == 0: fanout_counts[i] = fanout_counts[i - 1] for count in fanout_counts: fanout_data += struct.pack(">L", count) # Calculate chunk offsets header_size = ( 8 # signature + version + hash_version + num_chunks + base_graph_count ) toc_size = 4 * 12 # 4 entries (3 chunks + terminator) * 12 bytes each chunk1_offset = header_size + toc_size # OID Fanout chunk2_offset = chunk1_offset + len(fanout_data) # OID Lookup chunk3_offset = chunk2_offset + len(oid_lookup_data) # Commit Data terminator_offset = chunk3_offset + len(commit_data) # Write header f.write(COMMIT_GRAPH_SIGNATURE) f.write(struct.pack(">B", COMMIT_GRAPH_VERSION)) f.write(struct.pack(">B", self.hash_version)) f.write(struct.pack(">B", 3)) # 3 chunks f.write(struct.pack(">B", 0)) # 0 base graphs # Write table of contents f.write(CHUNK_OID_FANOUT + struct.pack(">Q", chunk1_offset)) f.write(CHUNK_OID_LOOKUP + struct.pack(">Q", chunk2_offset)) f.write(CHUNK_COMMIT_DATA + struct.pack(">Q", chunk3_offset)) f.write(b"\x00\x00\x00\x00" + struct.pack(">Q", terminator_offset)) # Write chunks f.write(fanout_data) f.write(oid_lookup_data) f.write(commit_data) def __len__(self) -> int: """Return number of commits in the graph.""" return len(self.entries) def __iter__(self) -> Iterator["CommitGraphEntry"]: """Iterate over commit graph entries.""" return iter(self.entries) def read_commit_graph(path: str | bytes) -> CommitGraph | None: """Read commit graph from file path.""" if isinstance(path, str): path = path.encode() if not os.path.exists(path): return None with open(path, "rb") as f: return CommitGraph.from_file(f) def find_commit_graph_file(git_dir: str | bytes) -> bytes | None: """Find commit graph file in a Git repository.""" if isinstance(git_dir, str): git_dir = git_dir.encode() # Standard location: .git/objects/info/commit-graph commit_graph_path = os.path.join(git_dir, b"objects", b"info", b"commit-graph") if os.path.exists(commit_graph_path): return commit_graph_path # Chain files in .git/objects/info/commit-graphs/ commit_graphs_dir = os.path.join(git_dir, b"objects", b"info", b"commit-graphs") if os.path.exists(commit_graphs_dir): # Look for graph-{hash}.graph files for filename in os.listdir(commit_graphs_dir): if filename.startswith(b"graph-") and filename.endswith(b".graph"): return os.path.join(commit_graphs_dir, filename) return None def generate_commit_graph( object_store: "BaseObjectStore", commit_ids: Sequence[ObjectID] ) -> CommitGraph: """Generate a commit graph from a set of commits. 
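    Example (a minimal sketch; assumes ``repo`` is an open dulwich ``Repo``)::

        graph = generate_commit_graph(repo.object_store, [repo.head()])
        entry = graph.get_entry_by_oid(repo.head())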
Args: object_store: Object store to retrieve commits from commit_ids: List of commit IDs to include in the graph Returns: CommitGraph object containing the specified commits """ graph = CommitGraph(object_format=object_store.object_format) if not commit_ids: return graph # Ensure all commit_ids are in the correct format for object store access hex_length = object_store.object_format.hex_length oid_length = object_store.object_format.oid_length normalized_commit_ids = [] for commit_id in commit_ids: if isinstance(commit_id, bytes) and len(commit_id) == hex_length: # Already hex ObjectID normalized_commit_ids.append(commit_id) elif isinstance(commit_id, bytes) and len(commit_id) == oid_length: # Binary SHA, convert to hex ObjectID normalized_commit_ids.append(sha_to_hex(RawObjectID(commit_id))) else: # Assume it's already correct format normalized_commit_ids.append(ObjectID(commit_id)) # Build a map of all commits and their metadata commit_map: dict[ObjectID, Commit] = {} for commit_id in normalized_commit_ids: try: commit_obj = object_store[commit_id] if commit_obj.type_name != b"commit": continue assert isinstance(commit_obj, Commit) commit_map[commit_id] = commit_obj except KeyError: # Commit not found, skip continue # Calculate generation numbers using topological sort generation_map: dict[bytes, int] = {} def calculate_generation(commit_id: ObjectID) -> int: if commit_id in generation_map: return generation_map[commit_id] if commit_id not in commit_map: # Unknown commit, assume generation 0 generation_map[commit_id] = 0 return 0 commit_obj = commit_map[commit_id] if not commit_obj.parents: # Root commit generation_map[commit_id] = 1 return 1 # Calculate based on parents max_parent_gen = 0 for parent_id in commit_obj.parents: parent_gen = calculate_generation(parent_id) max_parent_gen = max(max_parent_gen, parent_gen) generation = max_parent_gen + 1 generation_map[commit_id] = generation return generation # Calculate generation numbers for all commits for commit_id in commit_map: calculate_generation(commit_id) # Build commit graph entries for commit_id, commit_obj in commit_map.items(): # commit_id is already hex ObjectID from normalized_commit_ids commit_hex: ObjectID = commit_id # commit_obj.tree and commit_obj.parents are already ObjectIDs tree_hex = commit_obj.tree parents_hex: list[ObjectID] = commit_obj.parents entry = CommitGraphEntry( commit_id=commit_hex, tree_id=tree_hex, parents=parents_hex, generation=generation_map[commit_id], commit_time=commit_obj.commit_time, ) graph.entries.append(entry) # Build the OID to index mapping for lookups graph._oid_to_index = {} for i, entry in enumerate(graph.entries): # Convert hex ObjectID to binary RawObjectID for consistent lookup graph._oid_to_index[hex_to_sha(entry.commit_id)] = i return graph def write_commit_graph( git_dir: str | bytes, object_store: "BaseObjectStore", commit_ids: Sequence[ObjectID], ) -> None: """Write a commit graph file for the given commits. 
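    Example (a minimal sketch; assumes ``repo`` is an open dulwich ``Repo``)::

        commits = get_reachable_commits(repo.object_store, [repo.head()])
        write_commit_graph(repo.controldir(), repo.object_store, commits)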
Args: git_dir: Git directory path object_store: Object store to retrieve commits from commit_ids: List of commit IDs to include in the graph """ if isinstance(git_dir, str): git_dir = git_dir.encode() # Generate the commit graph graph = generate_commit_graph(object_store, commit_ids) if not graph.entries: return # Nothing to write # Ensure the objects/info directory exists info_dir = os.path.join(git_dir, b"objects", b"info") os.makedirs(info_dir, exist_ok=True) # Write using GitFile for atomic operation from .file import GitFile graph_path = os.path.join(info_dir, b"commit-graph") with GitFile(graph_path, "wb") as f: graph.write_to_file(f) def get_reachable_commits( object_store: "BaseObjectStore", start_commits: Sequence[ObjectID] ) -> list[ObjectID]: """Get all commits reachable from the given starting commits. Args: object_store: Object store to retrieve commits from start_commits: List of starting commit IDs Returns: List of all reachable commit IDs (including the starting commits) """ visited: set[ObjectID] = set() reachable: list[ObjectID] = [] stack: list[ObjectID] = [] hex_length = object_store.object_format.hex_length oid_length = object_store.object_format.oid_length # Normalize commit IDs for object store access and tracking for commit_id in start_commits: if isinstance(commit_id, bytes) and len(commit_id) == hex_length: # Hex ObjectID - use directly for object store access if commit_id not in visited: stack.append(commit_id) elif isinstance(commit_id, bytes) and len(commit_id) == oid_length: # Binary SHA, convert to hex ObjectID for object store access hex_id = sha_to_hex(RawObjectID(commit_id)) if hex_id not in visited: stack.append(hex_id) else: # Assume it's already correct format oid = ObjectID(commit_id) if oid not in visited: stack.append(oid) while stack: commit_id = stack.pop() if commit_id in visited: continue visited.add(commit_id) try: commit_obj = object_store[commit_id] if not isinstance(commit_obj, Commit): continue # Add to reachable list (commit_id is already hex ObjectID) reachable.append(commit_id) # Add parents to stack for parent_id in commit_obj.parents: if parent_id not in visited: stack.append(parent_id) except KeyError: # Commit not found, skip continue return reachable dulwich-1.0.0/dulwich/config.py000066400000000000000000001521001513301442600164230ustar00rootroot00000000000000# config.py - Reading and writing Git config files # Copyright (C) 2011-2013 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Reading and writing Git configuration files. 
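A quick example (the path shown is a placeholder)::

    from dulwich.config import ConfigFile

    config = ConfigFile.from_path("/path/to/repo/.git/config")
    url = config.get(("remote", "origin"), "url")  # returns bytes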
Todo: * preserve formatting when updating configuration files """ __all__ = [ "DEFAULT_MAX_INCLUDE_DEPTH", "MAX_INCLUDE_FILE_SIZE", "CaseInsensitiveOrderedMultiDict", "ConditionMatcher", "Config", "ConfigDict", "ConfigFile", "ConfigKey", "ConfigValue", "FileOpener", "StackedConfig", "apply_instead_of", "get_win_legacy_system_paths", "get_win_system_paths", "get_xdg_config_home_path", "iter_instead_of", "lower_key", "match_glob_pattern", "parse_submodules", "read_submodules", ] import logging import os import re import sys from collections.abc import ( Callable, ItemsView, Iterable, Iterator, KeysView, Mapping, MutableMapping, ValuesView, ) from contextlib import suppress from pathlib import Path from typing import ( IO, Generic, TypeVar, overload, ) from .file import GitFile, _GitFile ConfigKey = str | bytes | tuple[str | bytes, ...] ConfigValue = str | bytes | bool | int logger = logging.getLogger(__name__) # Type for file opener callback FileOpener = Callable[[str | os.PathLike[str]], IO[bytes]] # Type for includeIf condition matcher # Takes the condition value (e.g., "main" for onbranch:main) and returns bool ConditionMatcher = Callable[[str], bool] # Security limits for include files MAX_INCLUDE_FILE_SIZE = 1024 * 1024 # 1MB max for included config files DEFAULT_MAX_INCLUDE_DEPTH = 10 # Maximum recursion depth for includes def _match_gitdir_pattern( path: bytes, pattern: bytes, ignorecase: bool = False ) -> bool: """Simple gitdir pattern matching for includeIf conditions. This handles the basic gitdir patterns used in includeIf directives. """ # Convert to strings for easier manipulation path_str = path.decode("utf-8", errors="replace") pattern_str = pattern.decode("utf-8", errors="replace") # Normalize paths to use forward slashes for consistent matching path_str = path_str.replace("\\", "/") pattern_str = pattern_str.replace("\\", "/") if ignorecase: path_str = path_str.lower() pattern_str = pattern_str.lower() # Handle the common cases for gitdir patterns if pattern_str.startswith("**/") and pattern_str.endswith("/**"): # Pattern like **/dirname/** should match any path containing dirname dirname = pattern_str[3:-3] # Remove **/ and /** # Check if path contains the directory name as a path component return ("/" + dirname + "/") in path_str or path_str.endswith("/" + dirname) elif pattern_str.startswith("**/"): # Pattern like **/filename suffix = pattern_str[3:] # Remove **/ return suffix in path_str or path_str.endswith("/" + suffix) elif pattern_str.endswith("/**"): # Pattern like /path/to/dir/** should match /path/to/dir and any subdirectory base_pattern = pattern_str[:-3] # Remove /** return path_str == base_pattern or path_str.startswith(base_pattern + "/") elif "**" in pattern_str: # Handle patterns with ** in the middle parts = pattern_str.split("**") if len(parts) == 2: prefix, suffix = parts # Path must start with prefix and end with suffix (if any) if prefix and not path_str.startswith(prefix): return False if suffix and not path_str.endswith(suffix): return False return True # Direct match or simple glob pattern if "*" in pattern_str or "?" in pattern_str or "[" in pattern_str: import fnmatch return fnmatch.fnmatch(path_str, pattern_str) else: return path_str == pattern_str def match_glob_pattern(value: str, pattern: str) -> bool: r"""Match a value against a glob pattern. Supports simple glob patterns like ``*`` and ``**``. 
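    Example (the URLs are placeholders):

        >>> match_glob_pattern("ssh://org-foo@github.com/repo", "ssh://org-*@github.com/**")
        True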
Raises: ValueError: If the pattern is invalid """ # Convert glob pattern to regex pattern_escaped = re.escape(pattern) # Replace escaped \*\* with .* (match anything) pattern_escaped = pattern_escaped.replace(r"\*\*", ".*") # Replace escaped \* with [^/]* (match anything except /) pattern_escaped = pattern_escaped.replace(r"\*", "[^/]*") # Anchor the pattern pattern_regex = f"^{pattern_escaped}$" try: return bool(re.match(pattern_regex, value)) except re.error as e: raise ValueError(f"Invalid glob pattern {pattern!r}: {e}") def lower_key(key: ConfigKey) -> ConfigKey: """Convert a config key to lowercase, preserving subsection case. Args: key: Configuration key (str, bytes, or tuple) Returns: Key with section names lowercased, subsection names preserved Raises: TypeError: If key is not str, bytes, or tuple """ if isinstance(key, (bytes, str)): return key.lower() if isinstance(key, tuple): # For config sections, only lowercase the section name (first element) # but preserve the case of subsection names (remaining elements) if len(key) > 0: first = key[0] assert isinstance(first, (bytes, str)) return (first.lower(), *key[1:]) return key raise TypeError(key) K = TypeVar("K", bound=ConfigKey) # Key type must be ConfigKey V = TypeVar("V") # Value type _T = TypeVar("_T") # For get() default parameter class CaseInsensitiveOrderedMultiDict(MutableMapping[K, V], Generic[K, V]): """A case-insensitive ordered dictionary that can store multiple values per key. This class maintains the order of insertions and allows multiple values for the same key. Keys are compared case-insensitively. """ def __init__(self, default_factory: Callable[[], V] | None = None) -> None: """Initialize a CaseInsensitiveOrderedMultiDict. Args: default_factory: Optional factory function for default values """ self._real: list[tuple[K, V]] = [] self._keyed: dict[ConfigKey, V] = {} self._default_factory = default_factory @classmethod def make( cls, dict_in: "MutableMapping[K, V] | CaseInsensitiveOrderedMultiDict[K, V] | None" = None, default_factory: Callable[[], V] | None = None, ) -> "CaseInsensitiveOrderedMultiDict[K, V]": """Create a CaseInsensitiveOrderedMultiDict from an existing mapping. 
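        Example (a minimal sketch):

            >>> d = CaseInsensitiveOrderedMultiDict.make({"Core": "a"})
            >>> d["core"]
            'a'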
Args: dict_in: Optional mapping to initialize from default_factory: Optional factory function for default values Returns: New CaseInsensitiveOrderedMultiDict instance Raises: TypeError: If dict_in is not a mapping or None """ if isinstance(dict_in, cls): return dict_in out = cls(default_factory=default_factory) if dict_in is None: return out if not isinstance(dict_in, MutableMapping): raise TypeError for key, value in dict_in.items(): out[key] = value return out def __len__(self) -> int: """Return the number of unique keys in the dictionary.""" return len(self._keyed) def keys(self) -> KeysView[K]: """Return a view of the dictionary's keys.""" # Return a view of the original keys (not lowercased) # We need to deduplicate since _real can have duplicates seen = set() unique_keys = [] for k, _ in self._real: lower = lower_key(k) if lower not in seen: seen.add(lower) unique_keys.append(k) from collections.abc import KeysView as ABCKeysView class UniqueKeysView(ABCKeysView[K]): def __init__(self, keys: list[K]): self._keys = keys def __contains__(self, key: object) -> bool: return key in self._keys def __iter__(self) -> Iterator[K]: return iter(self._keys) def __len__(self) -> int: return len(self._keys) return UniqueKeysView(unique_keys) def items(self) -> ItemsView[K, V]: """Return a view of the dictionary's (key, value) pairs in insertion order.""" # Return a view that iterates over the real list to preserve order class OrderedItemsView(ItemsView[K, V]): """Items view that preserves insertion order.""" def __init__(self, mapping: CaseInsensitiveOrderedMultiDict[K, V]): self._mapping = mapping def __iter__(self) -> Iterator[tuple[K, V]]: return iter(self._mapping._real) def __len__(self) -> int: return len(self._mapping._real) def __contains__(self, item: object) -> bool: if not isinstance(item, tuple) or len(item) != 2: return False key, value = item return any(k == key and v == value for k, v in self._mapping._real) return OrderedItemsView(self) def __iter__(self) -> Iterator[K]: """Iterate over the dictionary's keys.""" # Return iterator over original keys (not lowercased), deduplicated seen = set() for k, _ in self._real: lower = lower_key(k) if lower not in seen: seen.add(lower) yield k def values(self) -> ValuesView[V]: """Return a view of the dictionary's values.""" return self._keyed.values() def __setitem__(self, key: K, value: V) -> None: """Set a value for a key, appending to existing values.""" self._real.append((key, value)) self._keyed[lower_key(key)] = value def set(self, key: K, value: V) -> None: """Set a value for a key, replacing all existing values. Args: key: The key to set value: The value to set """ # This method replaces all existing values for the key lower = lower_key(key) self._real = [(k, v) for k, v in self._real if lower_key(k) != lower] self._real.append((key, value)) self._keyed[lower] = value def __delitem__(self, key: K) -> None: """Delete all values for a key. Raises: KeyError: If the key is not found """ lower_k = lower_key(key) del self._keyed[lower_k] for i, (actual, unused_value) in reversed(list(enumerate(self._real))): if lower_key(actual) == lower_k: del self._real[i] def __getitem__(self, item: K) -> V: """Get the last value for a key. Raises: KeyError: If the key is not found """ return self._keyed[lower_key(item)] def get(self, key: K, /, default: V | _T | None = None) -> V | _T | None: # type: ignore[override] """Get the last value for a key, or a default if not found. 
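        Example (a minimal sketch):

            >>> d = CaseInsensitiveOrderedMultiDict()
            >>> d["Key"] = "v1"
            >>> d.get("key")
            'v1'
            >>> d.get("missing", "fallback")
            'fallback'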
Args: key: The key to look up default: Default value to return if key not found Returns: The value for the key, or default/default_factory result if not found """ try: return self[key] except KeyError: if default is not None: return default elif self._default_factory is not None: return self._default_factory() else: return None def get_all(self, key: K) -> Iterator[V]: """Get all values for a key in insertion order. Args: key: The key to look up Returns: Iterator of all values for the key """ lowered_key = lower_key(key) for actual, value in self._real: if lower_key(actual) == lowered_key: yield value def setdefault(self, key: K, default: V | None = None) -> V: """Get value for key, setting it to default if not present. Args: key: The key to look up default: Default value to set if key not found Returns: The existing value or the newly set default Raises: KeyError: If key not found and no default or default_factory """ try: return self[key] except KeyError: if default is not None: self[key] = default return default elif self._default_factory is not None: value = self._default_factory() self[key] = value return value else: raise Name = bytes NameLike = bytes | str Section = tuple[bytes, ...] SectionLike = bytes | str | tuple[bytes | str, ...] Value = bytes ValueLike = bytes | str class Config: """A Git configuration.""" def get(self, section: SectionLike, name: NameLike) -> Value: """Retrieve the contents of a configuration setting. Args: section: Tuple with section name and optional subsection name name: Variable name Returns: Contents of the setting Raises: KeyError: if the value is not set """ raise NotImplementedError(self.get) def get_multivar(self, section: SectionLike, name: NameLike) -> Iterator[Value]: """Retrieve the contents of a multivar configuration setting. Args: section: Tuple with section name and optional subsection namee name: Variable name Returns: Contents of the setting as iterable Raises: KeyError: if the value is not set """ raise NotImplementedError(self.get_multivar) @overload def get_boolean( self, section: SectionLike, name: NameLike, default: bool ) -> bool: ... @overload def get_boolean(self, section: SectionLike, name: NameLike) -> bool | None: ... def get_boolean( self, section: SectionLike, name: NameLike, default: bool | None = None ) -> bool | None: """Retrieve a configuration setting as boolean. Args: section: Tuple with section name and optional subsection name name: Name of the setting, including section and possible subsection. default: Default value if setting is not found Returns: Contents of the setting """ try: value = self.get(section, name) except KeyError: return default if value.lower() == b"true": return True elif value.lower() == b"false": return False raise ValueError(f"not a valid boolean string: {value!r}") def set( self, section: SectionLike, name: NameLike, value: ValueLike | bool ) -> None: """Set a configuration value. Args: section: Tuple with section name and optional subsection namee name: Name of the configuration value, including section and optional subsection value: value of the setting """ raise NotImplementedError(self.set) def items(self, section: SectionLike) -> Iterator[tuple[Name, Value]]: """Iterate over the configuration pairs for a specific section. Args: section: Tuple with section name and optional subsection namee Returns: Iterator over (name, value) pairs """ raise NotImplementedError(self.items) def sections(self) -> Iterator[Section]: """Iterate over the sections. 
Returns: Iterator over section tuples """ raise NotImplementedError(self.sections) def has_section(self, name: Section) -> bool: """Check if a specified section exists. Args: name: Name of section to check for Returns: boolean indicating whether the section exists """ return name in self.sections() class ConfigDict(Config): """Git configuration stored in a dictionary.""" def __init__( self, values: MutableMapping[Section, CaseInsensitiveOrderedMultiDict[Name, Value]] | None = None, encoding: str | None = None, ) -> None: """Create a new ConfigDict.""" if encoding is None: encoding = sys.getdefaultencoding() self.encoding = encoding self._values: CaseInsensitiveOrderedMultiDict[ Section, CaseInsensitiveOrderedMultiDict[Name, Value] ] = CaseInsensitiveOrderedMultiDict.make( values, default_factory=CaseInsensitiveOrderedMultiDict ) def __repr__(self) -> str: """Return string representation of ConfigDict.""" return f"{self.__class__.__name__}({self._values!r})" def __eq__(self, other: object) -> bool: """Check equality with another ConfigDict.""" return isinstance(other, self.__class__) and other._values == self._values def __getitem__(self, key: Section) -> CaseInsensitiveOrderedMultiDict[Name, Value]: """Get configuration values for a section. Raises: KeyError: If section not found """ return self._values.__getitem__(key) def __setitem__( self, key: Section, value: CaseInsensitiveOrderedMultiDict[Name, Value] ) -> None: """Set configuration values for a section.""" return self._values.__setitem__(key, value) def __delitem__(self, key: Section) -> None: """Delete a configuration section. Raises: KeyError: If section not found """ return self._values.__delitem__(key) def __iter__(self) -> Iterator[Section]: """Iterate over configuration sections.""" return self._values.__iter__() def __len__(self) -> int: """Return the number of sections.""" return self._values.__len__() def keys(self) -> KeysView[Section]: """Return a view of section names.""" return self._values.keys() @classmethod def _parse_setting(cls, name: str) -> tuple[str, str | None, str]: parts = name.split(".") if len(parts) == 3: return (parts[0], parts[1], parts[2]) else: return (parts[0], None, parts[1]) def _check_section_and_name( self, section: SectionLike, name: NameLike ) -> tuple[Section, Name]: if not isinstance(section, tuple): section = (section,) checked_section = tuple( [ subsection.encode(self.encoding) if not isinstance(subsection, bytes) else subsection for subsection in section ] ) if not isinstance(name, bytes): name = name.encode(self.encoding) return checked_section, name def get_multivar(self, section: SectionLike, name: NameLike) -> Iterator[Value]: """Get multiple values for a configuration setting. Args: section: Section name name: Setting name Returns: Iterator of configuration values """ section, name = self._check_section_and_name(section, name) if len(section) > 1: try: return self._values[section].get_all(name) except KeyError: pass return self._values[(section[0],)].get_all(name) def get( self, section: SectionLike, name: NameLike, ) -> Value: """Get a configuration value. Args: section: Section name name: Setting name Returns: Configuration value Raises: KeyError: if the value is not set """ section, name = self._check_section_and_name(section, name) if len(section) > 1: try: return self._values[section][name] except KeyError: pass return self._values[(section[0],)][name] def set( self, section: SectionLike, name: NameLike, value: ValueLike | bool, ) -> None: """Set a configuration value. 
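        Example (a minimal sketch):

            >>> cfg = ConfigDict()
            >>> cfg.set(("core",), "filemode", True)
            >>> cfg.get(("core",), "filemode")
            b'true'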
Args: section: Section name name: Setting name value: Configuration value """ section, name = self._check_section_and_name(section, name) if isinstance(value, bool): value = b"true" if value else b"false" if not isinstance(value, bytes): value = value.encode(self.encoding) section_dict = self._values.setdefault(section) if hasattr(section_dict, "set"): section_dict.set(name, value) else: section_dict[name] = value def add( self, section: SectionLike, name: NameLike, value: ValueLike | bool, ) -> None: """Add a value to a configuration setting, creating a multivar if needed.""" section, name = self._check_section_and_name(section, name) if isinstance(value, bool): value = b"true" if value else b"false" if not isinstance(value, bytes): value = value.encode(self.encoding) self._values.setdefault(section)[name] = value def remove(self, section: SectionLike, name: NameLike) -> None: """Remove a configuration setting. Args: section: Section name name: Setting name Raises: KeyError: If the section or name doesn't exist """ section, name = self._check_section_and_name(section, name) del self._values[section][name] def items(self, section: SectionLike) -> Iterator[tuple[Name, Value]]: """Get items in a section.""" section_bytes, _ = self._check_section_and_name(section, b"") section_dict = self._values.get(section_bytes) if section_dict is not None: return iter(section_dict.items()) return iter([]) def sections(self) -> Iterator[Section]: """Get all sections.""" return iter(self._values.keys()) def _format_string(value: bytes) -> bytes: if ( value.startswith((b" ", b"\t")) or value.endswith((b" ", b"\t")) or b"#" in value ): return b'"' + _escape_value(value) + b'"' else: return _escape_value(value) _ESCAPE_TABLE = { ord(b"\\"): ord(b"\\"), ord(b'"'): ord(b'"'), ord(b"n"): ord(b"\n"), ord(b"t"): ord(b"\t"), ord(b"b"): ord(b"\b"), } _COMMENT_CHARS = [ord(b"#"), ord(b";")] _WHITESPACE_CHARS = [ord(b"\t"), ord(b" ")] def _parse_string(value: bytes) -> bytes: value_array = bytearray(value.strip()) ret = bytearray() whitespace = bytearray() in_quotes = False i = 0 while i < len(value_array): c = value_array[i] if c == ord(b"\\"): i += 1 if i >= len(value_array): # Backslash at end of string - treat as literal backslash if whitespace: ret.extend(whitespace) whitespace = bytearray() ret.append(ord(b"\\")) else: try: v = _ESCAPE_TABLE[value_array[i]] if whitespace: ret.extend(whitespace) whitespace = bytearray() ret.append(v) except KeyError: # Unknown escape sequence - treat backslash as literal and process next char normally if whitespace: ret.extend(whitespace) whitespace = bytearray() ret.append(ord(b"\\")) i -= 1 # Reprocess the character after the backslash elif c == ord(b'"'): in_quotes = not in_quotes elif c in _COMMENT_CHARS and not in_quotes: # the rest of the line is a comment break elif c in _WHITESPACE_CHARS: whitespace.append(c) else: if whitespace: ret.extend(whitespace) whitespace = bytearray() ret.append(c) i += 1 if in_quotes: raise ValueError("missing end quote") return bytes(ret) def _escape_value(value: bytes) -> bytes: """Escape a value.""" value = value.replace(b"\\", b"\\\\") value = value.replace(b"\r", b"\\r") value = value.replace(b"\n", b"\\n") value = value.replace(b"\t", b"\\t") value = value.replace(b'"', b'\\"') return value def _check_variable_name(name: bytes) -> bool: for i in range(len(name)): c = name[i : i + 1] if not c.isalnum() and c != b"-": return False return True def _check_section_name(name: bytes) -> bool: for i in range(len(name)): c = name[i : i + 1] if not 
c.isalnum() and c not in (b"-", b"."): return False return True def _strip_comments(line: bytes) -> bytes: comment_bytes = {ord(b"#"), ord(b";")} quote = ord(b'"') string_open = False # Normalize line to bytearray for simple 2/3 compatibility for i, character in enumerate(bytearray(line)): # Comment characters outside balanced quotes denote comment start if character == quote: string_open = not string_open elif not string_open and character in comment_bytes: return line[:i] return line def _is_line_continuation(value: bytes) -> bool: """Check if a value ends with a line continuation backslash. A line continuation occurs when a line ends with a backslash that is: 1. Not escaped (not preceded by another backslash) 2. Not within quotes Args: value: The value to check Returns: True if the value ends with a line continuation backslash """ if not value.endswith((b"\\\n", b"\\\r\n")): return False # Remove only the newline characters, keep the content including the backslash if value.endswith(b"\\\r\n"): content = value[:-2] # Remove \r\n, keep the \ else: content = value[:-1] # Remove \n, keep the \ if not content.endswith(b"\\"): return False # Count consecutive backslashes at the end backslash_count = 0 for i in range(len(content) - 1, -1, -1): if content[i : i + 1] == b"\\": backslash_count += 1 else: break # If we have an odd number of backslashes, the last one is a line continuation # If we have an even number, they are all escaped and there's no continuation return backslash_count % 2 == 1 def _parse_section_header_line(line: bytes) -> tuple[Section, bytes]: # Parse section header ("[bla]") line = _strip_comments(line).rstrip() in_quotes = False escaped = False for i, c in enumerate(line): if escaped: escaped = False continue if c == ord(b'"'): in_quotes = not in_quotes if c == ord(b"\\"): escaped = True if c == ord(b"]") and not in_quotes: last = i break else: raise ValueError("expected trailing ]") pts = line[1:last].split(b" ", 1) line = line[last + 1 :] section: Section if len(pts) == 2: # Handle subsections - Git allows more complex syntax for certain sections like includeIf if pts[1][:1] == b'"' and pts[1][-1:] == b'"': # Standard quoted subsection pts[1] = pts[1][1:-1] elif pts[0] == b"includeIf": # Special handling for includeIf sections which can have complex conditions # Git allows these without strict quote validation pts[1] = pts[1].strip() if pts[1][:1] == b'"' and pts[1][-1:] == b'"': pts[1] = pts[1][1:-1] else: # Other sections must have quoted subsections raise ValueError(f"Invalid subsection {pts[1]!r}") if not _check_section_name(pts[0]): raise ValueError(f"invalid section name {pts[0]!r}") section = (pts[0], pts[1]) else: if not _check_section_name(pts[0]): raise ValueError(f"invalid section name {pts[0]!r}") pts = pts[0].split(b".", 1) if len(pts) == 2: section = (pts[0], pts[1]) else: section = (pts[0],) return section, line class ConfigFile(ConfigDict): """A Git configuration file, like .git/config or ~/.gitconfig.""" def __init__( self, values: MutableMapping[Section, CaseInsensitiveOrderedMultiDict[Name, Value]] | None = None, encoding: str | None = None, ) -> None: """Initialize a ConfigFile. 
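        Example (a minimal sketch):

            >>> cf = ConfigFile()
            >>> cf.set(("user",), "name", "Alice")
            >>> cf.get(("user",), "name")
            b'Alice'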
Args: values: Optional mapping of configuration values encoding: Optional encoding for the file (defaults to system encoding) """ super().__init__(values=values, encoding=encoding) self.path: str | None = None self._included_paths: set[str] = set() # Track included files to prevent cycles @classmethod def from_file( cls, f: IO[bytes], *, config_dir: str | None = None, included_paths: set[str] | None = None, include_depth: int = 0, max_include_depth: int = DEFAULT_MAX_INCLUDE_DEPTH, file_opener: FileOpener | None = None, condition_matchers: Mapping[str, ConditionMatcher] | None = None, ) -> "ConfigFile": """Read configuration from a file-like object. Args: f: File-like object to read from config_dir: Directory containing the config file (for relative includes) included_paths: Set of already included paths (to prevent cycles) include_depth: Current include depth (to prevent infinite recursion) max_include_depth: Maximum allowed include depth file_opener: Optional callback to open included files condition_matchers: Optional dict of condition matchers for includeIf """ if include_depth > max_include_depth: # Prevent excessive recursion raise ValueError(f"Maximum include depth ({max_include_depth}) exceeded") ret = cls() if included_paths is not None: ret._included_paths = included_paths.copy() section: Section | None = None setting = None continuation = None for lineno, line in enumerate(f.readlines()): if lineno == 0 and line.startswith(b"\xef\xbb\xbf"): line = line[3:] line = line.lstrip() if setting is None: if len(line) > 0 and line[:1] == b"[": section, line = _parse_section_header_line(line) ret._values.setdefault(section) if _strip_comments(line).strip() == b"": continue if section is None: raise ValueError(f"setting {line!r} without section") try: setting, value = line.split(b"=", 1) except ValueError: setting = line value = b"true" setting = setting.strip() if not _check_variable_name(setting): raise ValueError(f"invalid variable name {setting!r}") if _is_line_continuation(value): if value.endswith(b"\\\r\n"): continuation = value[:-3] else: continuation = value[:-2] else: continuation = None value = _parse_string(value) ret._values[section][setting] = value # Process include/includeIf directives ret._handle_include_directive( section, setting, value, config_dir=config_dir, include_depth=include_depth, max_include_depth=max_include_depth, file_opener=file_opener, condition_matchers=condition_matchers, ) setting = None else: # continuation line assert continuation is not None if _is_line_continuation(line): if line.endswith(b"\\\r\n"): continuation += line[:-3] else: continuation += line[:-2] else: continuation += line value = _parse_string(continuation) assert section is not None # Already checked above ret._values[section][setting] = value # Process include/includeIf directives ret._handle_include_directive( section, setting, value, config_dir=config_dir, include_depth=include_depth, max_include_depth=max_include_depth, file_opener=file_opener, condition_matchers=condition_matchers, ) continuation = None setting = None return ret def _handle_include_directive( self, section: Section | None, setting: bytes, value: bytes, *, config_dir: str | None, include_depth: int, max_include_depth: int, file_opener: FileOpener | None, condition_matchers: Mapping[str, ConditionMatcher] | None, ) -> None: """Handle include/includeIf directives during config parsing.""" if ( section is not None and setting == b"path" and ( section[0].lower() == b"include" or (len(section) > 1 and section[0].lower() 
== b"includeif") ) ): self._process_include( section, value, config_dir=config_dir, include_depth=include_depth, max_include_depth=max_include_depth, file_opener=file_opener, condition_matchers=condition_matchers, ) def _process_include( self, section: Section, path_value: bytes, *, config_dir: str | None, include_depth: int, max_include_depth: int, file_opener: FileOpener | None, condition_matchers: Mapping[str, ConditionMatcher] | None, ) -> None: """Process an include or includeIf directive.""" path_str = path_value.decode(self.encoding, errors="replace") # Handle includeIf conditions if len(section) > 1 and section[0].lower() == b"includeif": condition = section[1].decode(self.encoding, errors="replace") if not self._evaluate_includeif_condition( condition, config_dir, condition_matchers ): return # Resolve the include path include_path = self._resolve_include_path(path_str, config_dir) if not include_path: return # Check for circular includes try: abs_path = str(Path(include_path).resolve()) except (OSError, ValueError) as e: # Invalid path - log and skip logger.debug("Invalid include path %r: %s", include_path, e) return if abs_path in self._included_paths: return # Load and merge the included file try: # Use provided file opener or default to GitFile opener: FileOpener if file_opener is None: def opener(path: str | os.PathLike[str]) -> IO[bytes]: return GitFile(path, "rb") else: opener = file_opener f = opener(include_path) except (OSError, ValueError) as e: # Git silently ignores missing or unreadable include files # Log for debugging purposes logger.debug("Invalid include path %r: %s", include_path, e) else: with f as included_file: # Track this path to prevent cycles self._included_paths.add(abs_path) # Parse the included file included_config = ConfigFile.from_file( included_file, config_dir=os.path.dirname(include_path), included_paths=self._included_paths, include_depth=include_depth + 1, max_include_depth=max_include_depth, file_opener=file_opener, condition_matchers=condition_matchers, ) # Merge the included configuration self._merge_config(included_config) def _merge_config(self, other: "ConfigFile") -> None: """Merge another config file into this one.""" for section, values in other._values.items(): if section not in self._values: self._values[section] = CaseInsensitiveOrderedMultiDict() for key, value in values.items(): self._values[section][key] = value def _resolve_include_path(self, path: str, config_dir: str | None) -> str | None: """Resolve an include path to an absolute path.""" # Expand ~ to home directory path = os.path.expanduser(path) # If path is relative and we have a config directory, make it relative to that if not os.path.isabs(path) and config_dir: path = os.path.join(config_dir, path) return path def _evaluate_includeif_condition( self, condition: str, config_dir: str | None = None, condition_matchers: Mapping[str, ConditionMatcher] | None = None, ) -> bool: """Evaluate an includeIf condition.""" # Try custom matchers first if provided if condition_matchers: for prefix, matcher in condition_matchers.items(): if condition.startswith(prefix): return matcher(condition[len(prefix) :]) # Fall back to built-in matchers if condition.startswith("hasconfig:"): return self._evaluate_hasconfig_condition(condition[10:]) else: # Unknown condition type - log and ignore (Git behavior) logger.debug("Unknown includeIf condition: %r", condition) return False def _evaluate_hasconfig_condition(self, condition: str) -> bool: """Evaluate a hasconfig condition. 
Format: hasconfig:config.key:pattern Example: hasconfig:remote.*.url:ssh://org-*@github.com/** """ # Split on the first colon to separate config key from pattern parts = condition.split(":", 1) if len(parts) != 2: logger.debug("Invalid hasconfig condition format: %r", condition) return False config_key, pattern = parts # Parse the config key to get section and name key_parts = config_key.split(".", 2) if len(key_parts) < 2: logger.debug("Invalid hasconfig config key: %r", config_key) return False # Handle wildcards in section names (e.g., remote.*) if len(key_parts) == 3 and key_parts[1] == "*": # Match any subsection section_prefix = key_parts[0].encode(self.encoding) name = key_parts[2].encode(self.encoding) # Check all sections that match the pattern for section in self.sections(): if len(section) == 2 and section[0] == section_prefix: try: values = list(self.get_multivar(section, name)) for value in values: if self._match_hasconfig_pattern(value, pattern): return True except KeyError: continue else: # Direct section lookup if len(key_parts) == 2: section = (key_parts[0].encode(self.encoding),) name = key_parts[1].encode(self.encoding) else: section = ( key_parts[0].encode(self.encoding), key_parts[1].encode(self.encoding), ) name = key_parts[2].encode(self.encoding) try: values = list(self.get_multivar(section, name)) for value in values: if self._match_hasconfig_pattern(value, pattern): return True except KeyError: pass return False def _match_hasconfig_pattern(self, value: bytes, pattern: str) -> bool: """Match a config value against a hasconfig pattern. Supports simple glob patterns like ``*`` and ``**``. """ value_str = value.decode(self.encoding, errors="replace") return match_glob_pattern(value_str, pattern) @classmethod def from_path( cls, path: str | os.PathLike[str], *, max_include_depth: int = DEFAULT_MAX_INCLUDE_DEPTH, file_opener: FileOpener | None = None, condition_matchers: Mapping[str, ConditionMatcher] | None = None, ) -> "ConfigFile": """Read configuration from a file on disk. 
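        A sketch of typical use (the path and key shown are illustrative)::

            cf = ConfigFile.from_path("/path/to/.git/config")
            name = cf.get((b"user",), b"name")  # raises KeyError if unset
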
Args: path: Path to the configuration file max_include_depth: Maximum allowed include depth file_opener: Optional callback to open included files condition_matchers: Optional dict of condition matchers for includeIf """ abs_path = os.fspath(path) config_dir = os.path.dirname(abs_path) # Use provided file opener or default to GitFile opener: FileOpener if file_opener is None: def opener(p: str | os.PathLike[str]) -> IO[bytes]: return GitFile(p, "rb") else: opener = file_opener with opener(abs_path) as f: ret = cls.from_file( f, config_dir=config_dir, max_include_depth=max_include_depth, file_opener=file_opener, condition_matchers=condition_matchers, ) ret.path = abs_path return ret def write_to_path(self, path: str | os.PathLike[str] | None = None) -> None: """Write configuration to a file on disk.""" if path is None: if self.path is None: raise ValueError("No path specified and no default path available") path_to_use: str | os.PathLike[str] = self.path else: path_to_use = path with GitFile(path_to_use, "wb") as f: self.write_to_file(f) def write_to_file(self, f: IO[bytes] | _GitFile) -> None: """Write configuration to a file-like object.""" for section, values in self._values.items(): try: section_name, subsection_name = section except ValueError: (section_name,) = section subsection_name = None if subsection_name is None: f.write(b"[" + section_name + b"]\n") else: f.write(b"[" + section_name + b' "' + subsection_name + b'"]\n') for key, value in values.items(): value = _format_string(value) f.write(b"\t" + key + b" = " + value + b"\n") def get_xdg_config_home_path(*path_segments: str) -> str: """Get a path in the XDG config home directory. Args: *path_segments: Path segments to join to the XDG config home Returns: Full path in XDG config home directory """ xdg_config_home = os.environ.get( "XDG_CONFIG_HOME", os.path.expanduser("~/.config/"), ) return os.path.join(xdg_config_home, *path_segments) def _find_git_in_win_path() -> Iterator[str]: for exe in ("git.exe", "git.cmd"): for path in os.environ.get("PATH", "").split(";"): if os.path.exists(os.path.join(path, exe)): # in windows native shells (powershell/cmd) exe path is # .../Git/bin/git.exe or .../Git/cmd/git.exe # # in git-bash exe path is .../Git/mingw64/bin/git.exe git_dir, _bin_dir = os.path.split(path) yield git_dir parent_dir, basename = os.path.split(git_dir) if basename == "mingw32" or basename == "mingw64": yield parent_dir break def _find_git_in_win_reg() -> Iterator[str]: import platform import winreg if platform.machine() == "AMD64": subkey = ( "SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\" "CurrentVersion\\Uninstall\\Git_is1" ) else: subkey = "SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\Git_is1" for key in (winreg.HKEY_CURRENT_USER, winreg.HKEY_LOCAL_MACHINE): # type: ignore[attr-defined,unused-ignore] with suppress(OSError): with winreg.OpenKey(key, subkey) as k: # type: ignore[attr-defined,unused-ignore] val, typ = winreg.QueryValueEx(k, "InstallLocation") # type: ignore[attr-defined,unused-ignore] if typ == winreg.REG_SZ: # type: ignore[attr-defined,unused-ignore] yield val # There is no set standard for system config dirs on windows. We try the # following: # - %PROGRAMFILES%/Git/etc/gitconfig - Git for Windows (msysgit) config dir # Used if CGit installation (Git/bin/git.exe) is found in PATH in the # system registry def get_win_system_paths() -> Iterator[str]: """Get current Windows system Git config paths. Only returns the current Git for Windows config location, not legacy paths. 
""" # Try to find Git installation from PATH first for git_dir in _find_git_in_win_path(): yield os.path.join(git_dir, "etc", "gitconfig") return # Only use the first found path # Fall back to registry if not found in PATH for git_dir in _find_git_in_win_reg(): yield os.path.join(git_dir, "etc", "gitconfig") return # Only use the first found path def get_win_legacy_system_paths() -> Iterator[str]: """Get legacy Windows system Git config paths. Returns all possible config paths including deprecated locations. This function can be used for diagnostics or migration purposes. """ # Include deprecated PROGRAMDATA location if "PROGRAMDATA" in os.environ: yield os.path.join(os.environ["PROGRAMDATA"], "Git", "config") # Include all Git installations found for git_dir in _find_git_in_win_path(): yield os.path.join(git_dir, "etc", "gitconfig") for git_dir in _find_git_in_win_reg(): yield os.path.join(git_dir, "etc", "gitconfig") class StackedConfig(Config): """Configuration which reads from multiple config files..""" def __init__( self, backends: list[ConfigFile], writable: ConfigFile | None = None ) -> None: """Initialize a StackedConfig. Args: backends: List of config files to read from (in order of precedence) writable: Optional config file to write changes to """ self.backends = backends self.writable = writable def __repr__(self) -> str: """Return string representation of StackedConfig.""" return f"<{self.__class__.__name__} for {self.backends!r}>" @classmethod def default(cls) -> "StackedConfig": """Create a StackedConfig with default system/user config files. Returns: StackedConfig with default configuration files loaded """ return cls(cls.default_backends()) @classmethod def default_backends(cls) -> list[ConfigFile]: """Retrieve the default configuration. See git-config(1) for details on the files searched. 
""" paths = [] # Handle GIT_CONFIG_GLOBAL - overrides user config paths try: paths.append(os.environ["GIT_CONFIG_GLOBAL"]) except KeyError: paths.append(os.path.expanduser("~/.gitconfig")) paths.append(get_xdg_config_home_path("git", "config")) # Handle GIT_CONFIG_SYSTEM and GIT_CONFIG_NOSYSTEM try: paths.append(os.environ["GIT_CONFIG_SYSTEM"]) except KeyError: if "GIT_CONFIG_NOSYSTEM" not in os.environ: paths.append("/etc/gitconfig") if sys.platform == "win32": paths.extend(get_win_system_paths()) logger.debug("Loading gitconfig from paths: %s", paths) backends = [] for path in paths: try: cf = ConfigFile.from_path(path) logger.debug("Successfully loaded gitconfig from: %s", path) except FileNotFoundError: logger.debug("Gitconfig file not found: %s", path) continue backends.append(cf) return backends def get(self, section: SectionLike, name: NameLike) -> Value: """Get value from configuration.""" if not isinstance(section, tuple): section = (section,) for backend in self.backends: try: return backend.get(section, name) except KeyError: pass raise KeyError(name) def get_multivar(self, section: SectionLike, name: NameLike) -> Iterator[Value]: """Get multiple values from configuration.""" if not isinstance(section, tuple): section = (section,) for backend in self.backends: try: yield from backend.get_multivar(section, name) except KeyError: pass def set( self, section: SectionLike, name: NameLike, value: ValueLike | bool ) -> None: """Set value in configuration.""" if self.writable is None: raise NotImplementedError(self.set) return self.writable.set(section, name, value) def sections(self) -> Iterator[Section]: """Get all sections.""" seen = set() for backend in self.backends: for section in backend.sections(): if section not in seen: seen.add(section) yield section def read_submodules( path: str | os.PathLike[str], ) -> Iterator[tuple[bytes, bytes, bytes]]: """Read a .gitmodules file.""" cfg = ConfigFile.from_path(path) return parse_submodules(cfg) def parse_submodules(config: ConfigFile) -> Iterator[tuple[bytes, bytes, bytes]]: """Parse a gitmodules GitConfig file, returning submodules. Args: config: A `ConfigFile` Returns: list of tuples (submodule path, url, name), where name is quoted part of the section's name. """ for section in config.sections(): section_kind, section_name = section if section_kind == b"submodule": try: sm_path = config.get(section, b"path") sm_url = config.get(section, b"url") yield (sm_path, sm_url, section_name) except KeyError: # If either path or url is missing, just ignore this # submodule entry and move on to the next one. This is # how git itself handles malformed .gitmodule entries. 
pass def iter_instead_of(config: Config, push: bool = False) -> Iterable[tuple[str, str]]: """Iterate over insteadOf / pushInsteadOf values.""" for section in config.sections(): if section[0] != b"url": continue replacement = section[1] try: needles = list(config.get_multivar(section, "insteadOf")) except KeyError: needles = [] if push: try: needles += list(config.get_multivar(section, "pushInsteadOf")) except KeyError: pass for needle in needles: assert isinstance(needle, bytes) yield needle.decode("utf-8"), replacement.decode("utf-8") def apply_instead_of(config: Config, orig_url: str, push: bool = False) -> str: """Apply insteadOf / pushInsteadOf to a URL.""" longest_needle = "" updated_url = orig_url for needle, replacement in iter_instead_of(config, push): if not orig_url.startswith(needle): continue if len(longest_needle) < len(needle): longest_needle = needle updated_url = replacement + orig_url[len(needle) :] return updated_url dulwich-1.0.0/dulwich/contrib/000077500000000000000000000000001513301442600162455ustar00rootroot00000000000000dulwich-1.0.0/dulwich/contrib/README.md000066400000000000000000000002661513301442600175300ustar00rootroot00000000000000This directory contains code that some may find useful. Code here is not an official part of Dulwich, and may no longer work. Unlike the rest of Dulwich, it is not regularly tested. dulwich-1.0.0/dulwich/contrib/README.swift.rst000066400000000000000000000113111513301442600210640ustar00rootroot00000000000000Openstack Swift as backend for Dulwich ====================================== Fabien Boucher The module dulwich/contrib/swift.py implements dulwich.repo.BaseRepo in order to being compatible with Openstack Swift. We can then use Dulwich as server (Git server) and instead of using a regular POSIX file system to store repository objects we use the object storage Swift via its own API. c Git client <---> Dulwich server <---> Openstack Swift API This implementation is still a work in progress and we can say that is a Beta version so you need to be prepared to find bugs. Configuration file ------------------ We need to provide some configuration values in order to let Dulwich talk and authenticate against Swift. The following config file must be used as template:: [swift] # Authentication URL (Keystone or Swift) auth_url = http://127.0.0.1:5000/v2.0 # Authentication version to use auth_ver = 2 # The tenant and username separated by a semicolon username = admin;admin # The user password password = pass # The Object storage region to use (auth v2) (Default RegionOne) region_name = RegionOne # The Object storage endpoint URL to use (auth v2) (Default internalURL) endpoint_type = internalURL # Concurrency to use for parallel tasks (Default 10) concurrency = 10 # Size of the HTTP pool (Default 10) http_pool_length = 10 # Timeout delay for HTTP connections (Default 20) http_timeout = 20 # Chunk size to read from pack (Bytes) (Default 12228) chunk_length = 12228 # Cache size (MBytes) (Default 20) cache_length = 20 Note that for now we use the same tenant to perform the requests against Swift. Therefore there is only one Swift account used for storing repositories. Each repository will be contained in a Swift container. How to start unittest --------------------- There is no need to have a Swift cluster running to run the unitests. Just run the following command in the Dulwich source directory:: $ PYTHONPATH=. 
python -m tests.contrib.test_swift How to start functional tests ----------------------------- We provide some basic tests to perform smoke tests against a real Swift cluster. To run those functional tests you need a properly configured configuration file. The tests can be run as follow:: $ DULWICH_SWIFT_CFG=/etc/swift-dul.conf PYTHONPATH=. python -m tests.contrib.test_swift_smoke How to install -------------- Install the Dulwich library via the setup.py. The dependencies will be automatically retrieved from pypi:: $ python ./setup.py install How to run the server --------------------- Start the server using the following command:: $ python -m dulwich.contrib.swift daemon -c /etc/swift-dul.conf -l 127.0.0.1 Note that a lot of request will be performed against the Swift cluster so it is better to start the Dulwich server as close as possible of the Swift proxy. The best solution is to run the server on the Swift proxy node to reduce the latency. How to use ---------- Once you have validated that the functional tests is working as expected and the server is running we can init a bare repository. Run this command with the name of the repository to create:: $ python -m dulwich.contrib.swift init -c /etc/swift-dul.conf edeploy The repository name will be the container that will contain all the Git objects for the repository. Then standard c Git client can be used to perform operations against this repository. As an example we can clone the previously empty bare repository:: $ git clone git://localhost/edeploy Then push an existing project in it:: $ git clone https://github.com/enovance/edeploy.git edeployclone $ cd edeployclone $ git remote add alt git://localhost/edeploy $ git push alt master $ git ls-remote alt 9dc50a9a9bff1e232a74e365707f22a62492183e HEAD 9dc50a9a9bff1e232a74e365707f22a62492183e refs/heads/master The other Git commands can be used the way you do usually against a regular repository. Note the daemon subcommands starts a Git server listening for the Git protocol. Therefore there is no authentication or encryption at all between the cGIT client and the GIT server (Dulwich). Note on the .info file for pack object -------------------------------------- The Swift interface of Dulwich relies only on the pack format to store Git objects. Instead of using only an index (pack-sha.idx) along with the pack, we add a second file (pack-sha.info). This file is automatically created when a client pushes some references on the repository. The purpose of this file is to speed up pack creation server side when a client fetches some references. Currently this .info format is not optimized and may change in future. dulwich-1.0.0/dulwich/contrib/__init__.py000066400000000000000000000031101513301442600203510ustar00rootroot00000000000000# __init__.py -- Contrib module for Dulwich # Copyright (C) 2014 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Contributed implementations and extensions for dulwich. This package contains various contributed modules that extend dulwich functionality. These modules are maintained as part of dulwich but may have additional dependencies or more specialized use cases. Available modules: - greenthreads: Green-threaded support for finding missing objects - paramiko_vendor: SSH client implementation using paramiko - release_robot: Automated release management utilities - requests_vendor: HTTP client implementation using requests - swift: OpenStack Swift object storage backend """ __all__ = [] dulwich-1.0.0/dulwich/contrib/greenthreads.py000066400000000000000000000117161513301442600213000ustar00rootroot00000000000000# greenthreads.py -- Utility module for querying an ObjectStore with gevent # Copyright (C) 2013 eNovance SAS # # Author: Fabien Boucher # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Utility module for querying an ObjectStore with gevent.""" __all__ = ["GreenThreadsMissingObjectFinder"] from collections.abc import Callable, Sequence import gevent from gevent import pool from ..object_store import ( BaseObjectStore, MissingObjectFinder, _collect_ancestors, _collect_filetree_revs, ) from ..objects import Commit, ObjectID, Tag def _split_commits_and_tags( obj_store: BaseObjectStore, lst: Sequence[ObjectID], *, ignore_unknown: bool = False, pool: pool.Pool, ) -> tuple[set[ObjectID], set[ObjectID]]: """Split object id list into two list with commit SHA1s and tag SHA1s. Same implementation as object_store._split_commits_and_tags except we use gevent to parallelize object retrieval. """ commits = set() tags = set() def find_commit_type(sha: ObjectID) -> None: try: o = obj_store[sha] except KeyError: if not ignore_unknown: raise else: if isinstance(o, Commit): commits.add(sha) elif isinstance(o, Tag): tags.add(sha) commits.add(o.object[1]) else: raise KeyError(f"Not a commit or a tag: {sha!r}") jobs = [pool.spawn(find_commit_type, s) for s in lst] gevent.joinall(jobs) return (commits, tags) class GreenThreadsMissingObjectFinder(MissingObjectFinder): """Find the objects missing from another object store. Same implementation as object_store.MissingObjectFinder except we use gevent to parallelize object retrieval. 
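    A minimal usage sketch (``store``, ``haves`` and ``wants`` are assumed
    to already exist, and gevent must be installed)::

        finder = GreenThreadsMissingObjectFinder(
            store, haves, wants, concurrency=10)
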
""" def __init__( self, object_store: BaseObjectStore, haves: Sequence[ObjectID], wants: Sequence[ObjectID], progress: Callable[[bytes], None] | None = None, get_tagged: Callable[[], dict[ObjectID, ObjectID]] | None = None, concurrency: int = 1, get_parents: Callable[[ObjectID], list[ObjectID]] | None = None, ) -> None: """Initialize GreenThreadsMissingObjectFinder. Args: object_store: Object store to search haves: Objects we have wants: Objects we want progress: Optional progress callback get_tagged: Optional function to get tagged objects concurrency: Number of concurrent green threads get_parents: Optional function to get commit parents """ def collect_tree_sha(sha: ObjectID) -> None: self.sha_done.add(sha) obj = object_store[sha] if isinstance(obj, Commit): _collect_filetree_revs(object_store, obj.tree, self.sha_done) self.object_store = object_store p = pool.Pool(size=concurrency) have_commits, have_tags = _split_commits_and_tags( object_store, haves, ignore_unknown=True, pool=p ) want_commits, want_tags = _split_commits_and_tags( object_store, wants, ignore_unknown=False, pool=p ) all_ancestors: frozenset[ObjectID] = frozenset( _collect_ancestors(object_store, have_commits)[0] ) missing_commits, common_commits = _collect_ancestors( object_store, want_commits, all_ancestors ) self.sha_done = set() jobs = [p.spawn(collect_tree_sha, c) for c in common_commits] gevent.joinall(jobs) for t in have_tags: self.sha_done.add(t) missing_tags = want_tags.difference(have_tags) all_wants = missing_commits.union(missing_tags) self.objects_to_send: set[tuple[ObjectID, bytes | None, int | None, bool]] = { (w, None, 0, False) for w in all_wants } if progress is None: self.progress: Callable[[bytes], None] = lambda x: None else: self.progress = progress self._tagged: dict[ObjectID, ObjectID] = (get_tagged and get_tagged()) or {} dulwich-1.0.0/dulwich/contrib/paramiko_vendor.py000066400000000000000000000167251513301442600220120ustar00rootroot00000000000000# paramiko_vendor.py -- paramiko implementation of the SSHVendor interface # Copyright (C) 2013 Aaron O'Mullan # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Paramiko SSH support for Dulwich. To use this implementation as the SSH implementation in Dulwich, override the dulwich.client.get_ssh_vendor attribute: >>> from dulwich import client as _mod_client >>> from dulwich.contrib.paramiko_vendor import ParamikoSSHVendor >>> _mod_client.get_ssh_vendor = ParamikoSSHVendor This implementation has comprehensive tests in tests/contrib/test_paramiko_vendor.py. 
""" __all__ = [ "ParamikoSSHVendor", ] import os import warnings from typing import Any, BinaryIO, cast import paramiko import paramiko.client import paramiko.config class _ParamikoWrapper: """Wrapper for paramiko SSH channel to provide a file-like interface.""" def __init__(self, client: paramiko.SSHClient, channel: paramiko.Channel) -> None: """Initialize the paramiko wrapper. Args: client: The SSH client instance channel: The SSH channel for communication """ self.client = client self.channel = channel # Channel must block self.channel.setblocking(True) @property def stderr(self) -> BinaryIO: """Get stderr stream from the channel. Returns: Binary IO stream for stderr """ return cast(BinaryIO, self.channel.makefile_stderr("rb")) def can_read(self) -> bool: """Check if data is available to read. Returns: True if data is available """ return self.channel.recv_ready() def write(self, data: bytes) -> None: """Write data to the channel. Args: data: Bytes to write """ return self.channel.sendall(data) def read(self, n: int | None = None) -> bytes: """Read data from the channel. Args: n: Number of bytes to read (default: 4096) Returns: Bytes read from the channel """ data = self.channel.recv(n or 4096) data_len = len(data) # Closed socket if not data: return b"" # Read more if needed if n and data_len < n: diff_len = n - data_len return data + self.read(diff_len) return data def close(self) -> None: """Close the SSH channel.""" self.channel.close() class ParamikoSSHVendor: """SSH vendor implementation using paramiko.""" # http://docs.paramiko.org/en/2.4/api/client.html def __init__(self, **kwargs: object) -> None: """Initialize the paramiko SSH vendor. Args: **kwargs: Additional keyword arguments passed to SSHClient """ self.kwargs = kwargs self.ssh_config = self._load_ssh_config() def _load_ssh_config(self) -> paramiko.config.SSHConfig: """Load SSH configuration from ~/.ssh/config.""" ssh_config = paramiko.config.SSHConfig() config_path = os.path.expanduser("~/.ssh/config") try: with open(config_path) as config_file: ssh_config.parse(config_file) except FileNotFoundError: # Config file doesn't exist - this is normal, ignore silently pass except (OSError, PermissionError) as e: # Config file exists but can't be read - warn user warnings.warn(f"Could not read SSH config file {config_path}: {e}") return ssh_config def run_command( self, host: str, command: bytes, username: str | None = None, port: int | None = None, password: str | None = None, pkey: paramiko.PKey | None = None, key_filename: str | None = None, ssh_command: str | None = None, protocol_version: int | None = None, **kwargs: object, ) -> _ParamikoWrapper: """Run a command on a remote host via SSH. 
Args: host: Hostname to connect to command: Command to execute (as bytes) username: SSH username (optional) port: SSH port (optional) password: SSH password (optional) pkey: Private key for authentication (optional) key_filename: Path to private key file (optional) ssh_command: SSH command (ignored - Paramiko doesn't use external SSH) protocol_version: SSH protocol version (optional) **kwargs: Additional keyword arguments Returns: _ParamikoWrapper instance for the SSH channel """ # Convert bytes command to str for paramiko command_str = command.decode("utf-8") client = paramiko.SSHClient() # Get SSH config for this host host_config = self.ssh_config.lookup(host) connection_kwargs: dict[str, Any] = { "hostname": host_config.get("hostname", host) } connection_kwargs.update(self.kwargs) # Use SSH config values if not explicitly provided if username: connection_kwargs["username"] = username elif "user" in host_config: connection_kwargs["username"] = host_config["user"] if port: connection_kwargs["port"] = port elif "port" in host_config: connection_kwargs["port"] = int(host_config["port"]) if password: connection_kwargs["password"] = password if pkey: connection_kwargs["pkey"] = pkey if key_filename: connection_kwargs["key_filename"] = key_filename elif "identityfile" in host_config: # Use the first identity file from SSH config identity_files = host_config["identityfile"] if isinstance(identity_files, list) and identity_files: connection_kwargs["key_filename"] = identity_files[0] elif isinstance(identity_files, str): connection_kwargs["key_filename"] = identity_files connection_kwargs.update(kwargs) policy = paramiko.client.MissingHostKeyPolicy() client.set_missing_host_key_policy(policy) client.connect(**connection_kwargs) # Open SSH session transport = client.get_transport() if transport is None: raise RuntimeError("Transport is None") channel = transport.open_session() if protocol_version is None or protocol_version == 2: channel.set_environment_variable(name="GIT_PROTOCOL", value="version=2") # Run commands channel.exec_command(command_str) return _ParamikoWrapper(client, channel) dulwich-1.0.0/dulwich/contrib/release_robot.py000066400000000000000000000134551513301442600214540ustar00rootroot00000000000000# release_robot.py # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Determine last version string from tags. Alternate to `Versioneer `_ using `Dulwich `_ to sort tags by time from newest to oldest. Copy the following into the package ``__init__.py`` module:: from dulwich.contrib.release_robot import get_current_version __version__ = get_current_version() This example assumes the tags have a leading "v" like "v0.3", and that the ``.git`` folder is in a project folder that contains the package folder. 
EG:: * project | * .git | +-* package | * __init__.py <-- put __version__ here """ __all__ = [ "PATTERN", "PROJDIR", "get_current_version", "get_recent_tags", ] import datetime import logging import re import sys import time from typing import Any, cast from ..repo import Repo # CONSTANTS PROJDIR = "." PATTERN = r"[ a-zA-Z_\-]*([\d\.]+[\-\w\.]*)" def get_recent_tags(projdir: str = PROJDIR) -> list[tuple[str, list[Any]]]: """Get list of tags in order from newest to oldest and their datetimes. Args: projdir: path to ``.git`` Returns: list of tags sorted by commit time from newest to oldest Each tag in the list contains the tag name, commit time, commit id, author and any tag meta. If a tag isn't annotated, then its tag meta is ``None``. Otherwise the tag meta is a tuple containing the tag time, tag id and tag name. Time is in UTC. """ with Repo(projdir) as project: # dulwich repository object refs = project.get_refs() # dictionary of refs and their SHA-1 values tags = {} # empty dictionary to hold tags, commits and datetimes # iterate over refs in repository for key_bytes, value in refs.items(): key = key_bytes.decode("utf-8") # compatible with Python-3 obj = project.get_object(value) # dulwich object from SHA-1 # don't just check if object is "tag" b/c it could be a "commit" # instead check if "tags" is in the ref-name if "tags" not in key: # skip ref if not a tag continue # strip the leading text from refs to get "tag name" _, tag = key.rsplit("/", 1) # check if tag object is "commit" or "tag" pointing to a "commit" from ..objects import Commit, Tag if isinstance(obj, Tag): commit_info = obj.object # a tuple (commit class, commit id) tag_meta = ( datetime.datetime(*time.gmtime(obj.tag_time)[:6]), obj.id.decode("utf-8"), obj.name.decode("utf-8"), ) # compatible with Python-3 commit = project.get_object(commit_info[1]) # commit object else: commit = obj tag_meta = None # get tag commit datetime, but dulwich returns seconds since # beginning of epoch, so use Python time module to convert it to # timetuple then convert to datetime commit_obj = cast(Commit, commit) tags[tag] = [ datetime.datetime(*time.gmtime(commit_obj.commit_time)[:6]), commit_obj.id.decode("utf-8"), commit_obj.author.decode("utf-8"), tag_meta, ] # compatible with Python-3 # return list of tags sorted by their datetimes from newest to oldest return sorted(tags.items(), key=lambda tag: tag[1][0], reverse=True) def get_current_version( projdir: str = PROJDIR, pattern: str = PATTERN, logger: logging.Logger | None = None, ) -> str | None: """Return the most recent tag, using an options regular expression pattern. The default pattern will strip any characters preceding the first semantic version. *EG*: "Release-0.2.1-rc.1" will be come "0.2.1-rc.1". If no match is found, then the most recent tag is return without modification. 
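    For example (tag names are hypothetical), a newest tag of ``v0.3.1``
    yields ``0.3.1`` with the default pattern::

        version = get_current_version(projdir=".")
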
Args: projdir: path to ``.git`` pattern: regular expression pattern with group that matches version logger: a Python logging instance to capture exception Returns: tag matching first group in regular expression pattern """ tags = get_recent_tags(projdir) try: tag = tags[0][0] except IndexError: return None matches = re.match(pattern, tag) if matches: try: current_version = matches.group(1) return current_version except IndexError as err: if logger: logger.debug("Pattern %r didn't match tag %r: %s", pattern, tag, err) return tag else: if logger: logger.debug("Pattern %r didn't match tag %r", pattern, tag) return tag if __name__ == "__main__": if len(sys.argv) > 1: _PROJDIR = sys.argv[1] else: _PROJDIR = PROJDIR print(get_current_version(projdir=_PROJDIR)) dulwich-1.0.0/dulwich/contrib/requests_vendor.py000066400000000000000000000146551513301442600220620ustar00rootroot00000000000000# requests_vendor.py -- requests implementation of the AbstractHttpGitClient interface # Copyright (C) 2022 Eden Shalit # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Requests HTTP client support for Dulwich. To use this implementation as the HTTP implementation in Dulwich, override the dulwich.client.HttpGitClient attribute: >>> from dulwich import client as _mod_client >>> from dulwich.contrib.requests_vendor import RequestsHttpGitClient >>> _mod_client.HttpGitClient = RequestsHttpGitClient This implementation is experimental and does not have any tests. """ __all__ = [ "RequestsHttpGitClient", "get_session", ] from collections.abc import Callable, Iterator from io import BytesIO from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from ..config import ConfigFile from requests import Session from ..client import ( AbstractHttpGitClient, HTTPProxyUnauthorized, HTTPUnauthorized, default_user_agent_string, ) from ..errors import GitProtocolError, NotGitRepository class RequestsHttpGitClient(AbstractHttpGitClient): """HTTP Git client using the requests library.""" def __init__( self, base_url: str, dumb: bool | None = None, config: "ConfigFile | None" = None, username: str | None = None, password: str | None = None, thin_packs: bool = True, report_activity: Callable[[int, str], None] | None = None, quiet: bool = False, include_tags: bool = False, ) -> None: """Initialize RequestsHttpGitClient. 
Args: base_url: Base URL of the Git repository dumb: Whether to use dumb HTTP transport config: Git configuration file username: Username for authentication password: Password for authentication thin_packs: Whether to use thin packs report_activity: Function to report activity quiet: Whether to suppress output include_tags: Whether to include tags """ self._username = username self._password = password self.session = get_session(config) if username is not None: self.session.auth = (username, password) # type: ignore[assignment] super().__init__( base_url=base_url, dumb=bool(dumb) if dumb is not None else False, thin_packs=thin_packs, report_activity=report_activity, quiet=quiet, include_tags=include_tags, ) def _http_request( self, url: str, headers: dict[str, str] | None = None, data: bytes | Iterator[bytes] | None = None, raise_for_status: bool = True, ) -> tuple[Any, Callable[[int], bytes]]: req_headers = self.session.headers.copy() # type: ignore[attr-defined] if headers is not None: req_headers.update(headers) # Accept compression by default req_headers.setdefault("Accept-Encoding", "gzip") if data: resp = self.session.post(url, headers=req_headers, data=data) else: resp = self.session.get(url, headers=req_headers) if resp.status_code == 404: raise NotGitRepository if resp.status_code == 401: raise HTTPUnauthorized(resp.headers.get("WWW-Authenticate"), url) if resp.status_code == 407: raise HTTPProxyUnauthorized(resp.headers.get("Proxy-Authenticate"), url) if resp.status_code != 200: raise GitProtocolError(f"unexpected http resp {resp.status_code} for {url}") # Add required fields as stated in AbstractHttpGitClient._http_request resp.content_type = resp.headers.get("Content-Type") # type: ignore[attr-defined] resp.redirect_location = "" # type: ignore[attr-defined] if resp.history: resp.redirect_location = resp.url # type: ignore[attr-defined] read = BytesIO(resp.content).read return resp, read def get_session(config: "ConfigFile | None") -> Session: """Create a requests session with Git configuration. 
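    Honours ``http.proxy``, ``http.useragent``, ``http.sslVerify`` and
    ``http.sslCAInfo`` when they are present in the supplied config; a
    minimal sketch (the config object is assumed to exist)::

        session = get_session(config)
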
Args: config: Git configuration file Returns: Configured requests Session """ session = Session() session.headers.update({"Pragma": "no-cache"}) proxy_server: str | None = None user_agent: str | None = None ca_certs: str | None = None ssl_verify: bool | None = None if config is not None: try: proxy_bytes = config.get(b"http", b"proxy") if isinstance(proxy_bytes, bytes): proxy_server = proxy_bytes.decode() except KeyError: pass try: agent_bytes = config.get(b"http", b"useragent") if isinstance(agent_bytes, bytes): user_agent = agent_bytes.decode() except KeyError: pass try: ssl_verify = config.get_boolean(b"http", b"sslVerify") except KeyError: ssl_verify = True try: certs_bytes = config.get(b"http", b"sslCAInfo") if isinstance(certs_bytes, bytes): ca_certs = certs_bytes.decode() except KeyError: ca_certs = None if user_agent is None: user_agent = default_user_agent_string() if user_agent is not None: session.headers.update({"User-agent": user_agent}) if ca_certs: session.verify = ca_certs elif ssl_verify is False: session.verify = ssl_verify if proxy_server is not None: session.proxies.update({"http": proxy_server, "https": proxy_server}) return session dulwich-1.0.0/dulwich/contrib/swift.py000066400000000000000000001264501513301442600177630ustar00rootroot00000000000000# swift.py -- Repo implementation atop OpenStack SWIFT # Copyright (C) 2013 eNovance SAS # # Author: Fabien Boucher # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Repo implementation atop OpenStack SWIFT.""" __all__ = [ "PackInfoMissingObjectFinder", "SwiftConnector", "SwiftException", "SwiftInfoRefsContainer", "SwiftObjectStore", "SwiftPack", "SwiftPackData", "SwiftPackReader", "SwiftRepo", "SwiftSystemBackend", "cmd_daemon", "cmd_init", "load_conf", "load_pack_info", "main", "pack_info_create", "swift_load_pack_index", ] # TODO: Refactor to share more code with dulwich/repo.py. 
# TODO(fbo): Second attempt to _send() must be notified via real log # TODO(fbo): More logs for operations import json import logging import os import posixpath import stat import sys import tempfile import urllib.parse as urlparse import zlib from collections.abc import Callable, Iterator, Mapping from configparser import ConfigParser from io import BytesIO from typing import TYPE_CHECKING, Any, BinaryIO, cast if TYPE_CHECKING: from dulwich.object_format import ObjectFormat from geventhttpclient import HTTPClient from ..file import _GitFile from ..lru_cache import LRUSizeCache from ..object_store import INFODIR, PACKDIR, PackBasedObjectStore from ..objects import S_ISGITLINK, Blob, Commit, ObjectID, Tag, Tree from ..pack import ( ObjectContainer, Pack, PackData, PackIndex, PackIndexer, PackStreamCopier, _compute_object_size, compute_file_sha, iter_sha1, load_pack_index_file, read_pack_header, unpack_object, write_pack_header, write_pack_index_v2, write_pack_object, ) from ..protocol import TCP_GIT_PORT, split_peeled_refs, write_info_refs from ..refs import HEADREF, Ref, RefsContainer, read_info_refs from ..repo import OBJECTDIR, BaseRepo from ..server import Backend, BackendRepo, TCPGitServer from .greenthreads import GreenThreadsMissingObjectFinder """ # Configuration file sample [swift] # Authentication URL (Keystone or Swift) auth_url = http://127.0.0.1:5000/v2.0 # Authentication version to use auth_ver = 2 # The tenant and username separated by a semicolon username = admin;admin # The user password password = pass # The Object storage region to use (auth v2) (Default RegionOne) region_name = RegionOne # The Object storage endpoint URL to use (auth v2) (Default internalURL) endpoint_type = internalURL # Concurrency to use for parallel tasks (Default 10) concurrency = 10 # Size of the HTTP pool (Default 10) http_pool_length = 10 # Timeout delay for HTTP connections (Default 20) http_timeout = 20 # Chunk size to read from pack (Bytes) (Default 12228) chunk_length = 12228 # Cache size (MBytes) (Default 20) cache_length = 20 """ class PackInfoMissingObjectFinder(GreenThreadsMissingObjectFinder): """Find missing objects required for pack generation.""" def next(self) -> tuple[bytes, int, bytes | None] | None: """Get the next missing object. Returns: Tuple of (sha, pack_type_num, name) or None if no more objects """ while True: if not self.objects_to_send: return None (sha, name, leaf, _) = self.objects_to_send.pop() if sha not in self.sha_done: break if not leaf: try: obj = self.object_store[sha] if isinstance(obj, Commit): self.add_todo([(obj.tree, b"", None, False)]) elif isinstance(obj, Tree): tree_items = [ ( item.sha, item.path if isinstance(item.path, bytes) else item.path.encode("utf-8") if item.path is not None else b"", None, False, ) for item in obj.items() if item.sha is not None ] self.add_todo(tree_items) elif isinstance(obj, Tag): self.add_todo([(obj.object[1], None, None, False)]) if sha in self._tagged: self.add_todo([(self._tagged[sha], None, None, True)]) except KeyError: pass self.sha_done.add(sha) self.progress(f"counting objects: {len(self.sha_done)}\r".encode()) return ( sha, 0, name if isinstance(name, bytes) else name.encode("utf-8") if name else None, ) def load_conf(path: str | None = None, file: str | None = None) -> ConfigParser: """Load configuration in global var CONF. 
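    Returns a ``ConfigParser``; when neither argument is given, the path is
    read from the ``DULWICH_SWIFT_CFG`` environment variable. Illustrative
    call (the path is an example only)::

        conf = load_conf("/etc/swift-dul.conf")
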
Args: path: The path to the configuration file file: If provided read instead the file like object """ conf = ConfigParser() if file: conf.read_file(file, path) else: confpath = None if not path: try: confpath = os.environ["DULWICH_SWIFT_CFG"] except KeyError as exc: raise Exception("You need to specify a configuration file") from exc else: confpath = path if not os.path.isfile(confpath): raise Exception(f"Unable to read configuration file {confpath}") conf.read(confpath) return conf def swift_load_pack_index( scon: "SwiftConnector", filename: str, object_format: ObjectFormat ) -> "PackIndex": """Read a pack index file from Swift. Args: scon: a `SwiftConnector` instance filename: Path to the index file objectise object_format: Object format for this pack Returns: a `PackIndexer` instance """ f = scon.get_object(filename) if f is None: raise Exception(f"Could not retrieve index file {filename}") if isinstance(f, bytes): f = BytesIO(f) return load_pack_index_file(filename, f, object_format) def pack_info_create(pack_data: "PackData", pack_index: "PackIndex") -> bytes: """Create pack info file contents. Args: pack_data: The pack data object pack_index: The pack index object Returns: Compressed JSON bytes containing pack information """ pack = Pack.from_objects(pack_data, pack_index) info: dict[bytes, Any] = {} for obj in pack.iterobjects(): # Commit if obj.type_num == Commit.type_num: commit_obj = obj assert isinstance(commit_obj, Commit) info[obj.id] = (obj.type_num, commit_obj.parents, commit_obj.tree) # Tree elif obj.type_num == Tree.type_num: tree_obj = obj assert isinstance(tree_obj, Tree) shas = [ (s, n, not stat.S_ISDIR(m)) for n, m, s in tree_obj.items() if m is not None and not S_ISGITLINK(m) ] info[obj.id] = (obj.type_num, shas) # Blob elif obj.type_num == Blob.type_num: info[obj.id] = (obj.type_num,) # Tag elif obj.type_num == Tag.type_num: tag_obj = obj assert isinstance(tag_obj, Tag) info[obj.id] = (obj.type_num, tag_obj.object[1]) return zlib.compress(json.dumps(info).encode("utf-8")) def load_pack_info( filename: str, scon: "SwiftConnector | None" = None, file: BinaryIO | None = None, ) -> dict[str, Any] | None: """Load pack info from Swift or file. Args: filename: The pack info filename scon: Optional Swift connector to use for loading file: Optional file object to read from instead Returns: Dictionary containing pack information or None if not found """ if not file: if scon is None: return None obj = scon.get_object(filename) if obj is None: return None if isinstance(obj, bytes): return cast(dict[str, Any], json.loads(zlib.decompress(obj))) else: f: BinaryIO = obj else: f = file try: return cast(dict[str, Any], json.loads(zlib.decompress(f.read()))) finally: if hasattr(f, "close"): f.close() class SwiftException(Exception): """Exception raised for Swift-related errors.""" class SwiftConnector: """A Connector to swift that manage authentication and errors catching.""" def __init__(self, root: str, conf: ConfigParser) -> None: """Initialize a SwiftConnector. 
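        Construction authenticates against Swift immediately; a sketch
        (container name and configuration path are illustrative)::

            scon = SwiftConnector("myrepo", load_conf("/etc/swift-dul.conf"))
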
Args: root: The swift container that will act as Git bare repository conf: A ConfigParser Object """ self.conf = conf self.auth_ver = self.conf.get("swift", "auth_ver") if self.auth_ver not in ["1", "2"]: raise NotImplementedError("Wrong authentication version use either 1 or 2") self.auth_url = self.conf.get("swift", "auth_url") self.user = self.conf.get("swift", "username") self.password = self.conf.get("swift", "password") self.concurrency = self.conf.getint("swift", "concurrency") or 10 self.http_timeout = self.conf.getint("swift", "http_timeout") or 20 self.http_pool_length = self.conf.getint("swift", "http_pool_length") or 10 self.region_name = self.conf.get("swift", "region_name") or "RegionOne" self.endpoint_type = self.conf.get("swift", "endpoint_type") or "internalURL" self.cache_length = self.conf.getint("swift", "cache_length") or 20 self.chunk_length = self.conf.getint("swift", "chunk_length") or 12228 self.root = root block_size = 1024 * 12 # 12KB if self.auth_ver == "1": self.storage_url, self.token = self.swift_auth_v1() else: self.storage_url, self.token = self.swift_auth_v2() token_header = {"X-Auth-Token": str(self.token)} self.httpclient = HTTPClient.from_url( str(self.storage_url), concurrency=self.http_pool_length, block_size=block_size, connection_timeout=self.http_timeout, network_timeout=self.http_timeout, headers=token_header, ) self.base_path = str( posixpath.join(urlparse.urlparse(self.storage_url).path, self.root) ) def swift_auth_v1(self) -> tuple[str, str]: """Authenticate with Swift using v1 authentication. Returns: Tuple of (storage_url, auth_token) Raises: SwiftException: If authentication fails """ self.user = self.user.replace(";", ":") auth_httpclient = HTTPClient.from_url( self.auth_url, connection_timeout=self.http_timeout, network_timeout=self.http_timeout, ) headers = {"X-Auth-User": self.user, "X-Auth-Key": self.password} path = urlparse.urlparse(self.auth_url).path ret = auth_httpclient.request("GET", path, headers=headers) # Should do something with redirections (301 in my case) if ret.status_code < 200 or ret.status_code >= 300: raise SwiftException( "AUTH v1.0 request failed on " + f"{self.auth_url} with error code {ret.status_code} ({ret.items()!s})" ) storage_url = ret["X-Storage-Url"] token = ret["X-Auth-Token"] return storage_url, token def swift_auth_v2(self) -> tuple[str, str]: """Authenticate with Swift using v2 authentication. 
Returns: Tuple of (storage_url, auth_token) Raises: SwiftException: If authentication fails """ self.tenant, self.user = self.user.split(";") auth_dict = {} auth_dict["auth"] = { "passwordCredentials": { "username": self.user, "password": self.password, }, "tenantName": self.tenant, } auth_json = json.dumps(auth_dict) headers = {"Content-Type": "application/json"} auth_httpclient = HTTPClient.from_url( self.auth_url, connection_timeout=self.http_timeout, network_timeout=self.http_timeout, ) path = urlparse.urlparse(self.auth_url).path if not path.endswith("tokens"): path = posixpath.join(path, "tokens") ret = auth_httpclient.request("POST", path, body=auth_json, headers=headers) if ret.status_code < 200 or ret.status_code >= 300: raise SwiftException( "AUTH v2.0 request failed on " + f"{str(auth_httpclient.get_base_url()) + path} with error code {ret.status_code} ({ret.items()!s})" ) auth_ret_json = json.loads(ret.read()) token = auth_ret_json["access"]["token"]["id"] catalogs = auth_ret_json["access"]["serviceCatalog"] object_store = next( o_store for o_store in catalogs if o_store["type"] == "object-store" ) endpoints = object_store["endpoints"] endpoint = next( endp for endp in endpoints if endp["region"] == self.region_name ) return endpoint[self.endpoint_type], token def test_root_exists(self) -> bool | None: """Check that Swift container exist. Returns: True if exist or None it not """ ret = self.httpclient.request("HEAD", self.base_path) if ret.status_code == 404: return None if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"HEAD request failed with error code {ret.status_code}" ) return True def create_root(self) -> None: """Create the Swift container. Raises: SwiftException: if unable to create """ if not self.test_root_exists(): ret = self.httpclient.request("PUT", self.base_path) if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"PUT request failed with error code {ret.status_code}" ) def get_container_objects(self) -> list[dict[str, Any]] | None: """Retrieve objects list in a container. Returns: A list of dict that describe objects or None if container does not exist """ qs = "?format=json" path = self.base_path + qs ret = self.httpclient.request("GET", path) if ret.status_code == 404: return None if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"GET request failed with error code {ret.status_code}" ) content = ret.read() return cast(list[dict[str, Any]], json.loads(content)) def get_object_stat(self, name: str) -> dict[str, Any] | None: """Retrieve object stat. Args: name: The object name Returns: A dict that describe the object or None if object does not exist """ path = self.base_path + "/" + name ret = self.httpclient.request("HEAD", path) if ret.status_code == 404: return None if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"HEAD request failed with error code {ret.status_code}" ) resp_headers = {} for header, value in ret.items(): resp_headers[header.lower()] = value return resp_headers def put_object(self, name: str, content: BinaryIO) -> None: """Put an object. 
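        A minimal sketch (object name and payload are illustrative)::

            scon.put_object("info/refs", BytesIO(b"example payload"))
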
Args: name: The object name content: A file object Raises: SwiftException: if unable to create """ content.seek(0) data = content.read() path = self.base_path + "/" + name headers = {"Content-Length": str(len(data))} def _send() -> object: ret = self.httpclient.request("PUT", path, body=data, headers=headers) return ret try: # Sometime got Broken Pipe - Dirty workaround ret = _send() except (BrokenPipeError, ConnectionError): # Second attempt work ret = _send() if ret.status_code < 200 or ret.status_code > 300: # type: ignore raise SwiftException( f"PUT request failed with error code {ret.status_code}" # type: ignore ) def get_object(self, name: str, range: str | None = None) -> bytes | BytesIO | None: """Retrieve an object. Args: name: The object name range: A string range like "0-10" to retrieve specified bytes in object content Returns: A file like instance or bytestring if range is specified """ headers = {} if range: headers["Range"] = f"bytes={range}" path = self.base_path + "/" + name ret = self.httpclient.request("GET", path, headers=headers) if ret.status_code == 404: return None if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"GET request failed with error code {ret.status_code}" ) content = cast(bytes, ret.read()) if range: return content return BytesIO(content) def del_object(self, name: str) -> None: """Delete an object. Args: name: The object name Raises: SwiftException: if unable to delete """ path = self.base_path + "/" + name ret = self.httpclient.request("DELETE", path) if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"DELETE request failed with error code {ret.status_code}" ) def del_root(self) -> None: """Delete the root container by removing container content. Raises: SwiftException: if unable to delete """ objects = self.get_container_objects() if objects: for obj in objects: self.del_object(obj["name"]) ret = self.httpclient.request("DELETE", self.base_path) if ret.status_code < 200 or ret.status_code > 300: raise SwiftException( f"DELETE request failed with error code {ret.status_code}" ) class SwiftPackReader: """A SwiftPackReader that mimic read and sync method. The reader allows to read a specified amount of bytes from a given offset of a Swift object. A read offset is kept internally. The reader will read from Swift a specified amount of data to complete its internal buffer. chunk_length specify the amount of data to read from Swift. """ def __init__(self, scon: SwiftConnector, filename: str, pack_length: int) -> None: """Initialize a SwiftPackReader. Args: scon: a `SwiftConnector` instance filename: the pack filename pack_length: The size of the pack object """ self.scon = scon self.filename = filename self.pack_length = pack_length self.offset = 0 self.base_offset = 0 self.buff = b"" self.buff_length = self.scon.chunk_length def _read(self, more: bool = False) -> None: if more: self.buff_length = self.buff_length * 2 offset = self.base_offset r = min(self.base_offset + self.buff_length, self.pack_length) ret = self.scon.get_object(self.filename, range=f"{offset}-{r}") if ret is None: self.buff = b"" elif isinstance(ret, bytes): self.buff = ret else: self.buff = ret.read() def read(self, length: int) -> bytes: """Read a specified amount of Bytes form the pack object. 
Args: length: amount of bytes to read Returns: a bytestring """ end = self.offset + length if self.base_offset + end > self.pack_length: data = self.buff[self.offset :] self.offset = end return data if end > len(self.buff): # Need to read more from swift self._read(more=True) return self.read(length) data = self.buff[self.offset : end] self.offset = end return data def seek(self, offset: int) -> None: """Seek to a specified offset. Args: offset: the offset to seek to """ self.base_offset = offset self._read() self.offset = 0 def read_checksum(self) -> bytes: """Read the checksum from the pack. Returns: the checksum bytestring """ ret = self.scon.get_object(self.filename, range="-20") if ret is None: return b"" elif isinstance(ret, bytes): return ret else: return ret.read() class SwiftPackData(PackData): """The data contained in a packfile. We use the SwiftPackReader to read bytes from packs stored in Swift using the Range header feature of Swift. """ def __init__( self, scon: SwiftConnector, filename: str | os.PathLike[str], object_format: "ObjectFormat | None" = None, ) -> None: """Initialize a SwiftPackReader. Args: scon: a `SwiftConnector` instance filename: the pack filename object_format: Object format for this pack """ from dulwich.object_format import DEFAULT_OBJECT_FORMAT if object_format is None: import warnings warnings.warn( "SwiftPackData() should be called with object_format parameter", DeprecationWarning, stacklevel=2, ) object_format = DEFAULT_OBJECT_FORMAT self.object_format = object_format self.scon = scon self._filename = filename self._header_size = 12 headers = self.scon.get_object_stat(str(self._filename)) if headers is None: raise Exception(f"Could not get stats for {self._filename}") self.pack_length = int(headers["content-length"]) pack_reader = SwiftPackReader(self.scon, str(self._filename), self.pack_length) (_version, self._num_objects) = read_pack_header(pack_reader.read) self._offset_cache = LRUSizeCache( 1024 * 1024 * self.scon.cache_length, compute_size=_compute_object_size, ) self.pack = None def get_object_at( self, offset: int ) -> tuple[int, tuple[bytes | int, list[bytes]] | list[bytes]]: """Get the object at a specific offset in the pack. Args: offset: The offset in the pack file Returns: Tuple of (pack_type_num, object_data) """ if offset in self._offset_cache: return self._offset_cache[offset] assert offset >= self._header_size pack_reader = SwiftPackReader(self.scon, str(self._filename), self.pack_length) pack_reader.seek(offset) unpacked, _ = unpack_object(pack_reader.read, self.object_format.hash_func) obj_data = unpacked._obj() return (unpacked.pack_type_num, obj_data) def get_stored_checksum(self) -> bytes: """Get the stored checksum for this pack. Returns: The pack checksum as bytes """ pack_reader = SwiftPackReader(self.scon, str(self._filename), self.pack_length) return pack_reader.read_checksum() def close(self) -> None: """Close the pack data (no-op for Swift).""" class SwiftPack(Pack): """A Git pack object. Same implementation as pack.Pack except that _idx_load and _data_load are bounded to Swift version of load_pack_index and PackData. """ def __init__(self, *args: object, **kwargs: object) -> None: """Initialize SwiftPack. 
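        Example (illustrative sketch; ``scon`` and ``object_format`` are
        assumed to exist, and the basename is a placeholder given without
        its ".pack"/".idx" extension)::

            pack = SwiftPack(
                "objects/pack/pack-xyz",
                object_format=object_format,
                scon=scon,
            )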
Args: *args: Arguments to pass to parent class **kwargs: Keyword arguments, must include 'scon' (SwiftConnector) """ self.scon: SwiftConnector = kwargs["scon"] # type: ignore del kwargs["scon"] super().__init__(*args, **kwargs) # type: ignore self._pack_info_path = self._basename + ".info" self._pack_info: dict[str, Any] | None = None self._pack_info_load: Callable[[], dict[str, Any] | None] = ( lambda: load_pack_info(self._pack_info_path, self.scon) ) self._idx_load = lambda: swift_load_pack_index( self.scon, self._idx_path, self.object_format ) self._data_load = lambda: SwiftPackData(self.scon, self._data_path) @property def pack_info(self) -> dict[str, Any] | None: """The pack data object being used.""" if self._pack_info is None: self._pack_info = self._pack_info_load() return self._pack_info class SwiftObjectStore(PackBasedObjectStore): """A Swift Object Store. Allow to manage a bare Git repository from Openstack Swift. This object store only supports pack files and not loose objects. """ def __init__(self, scon: SwiftConnector) -> None: """Open a Swift object store. Args: scon: A `SwiftConnector` instance """ super().__init__() self.scon = scon self.root = self.scon.root self.pack_dir = posixpath.join(OBJECTDIR, PACKDIR) self._alternates = None def _update_pack_cache(self) -> list[Any]: objects = self.scon.get_container_objects() if objects is None: return [] pack_files = [ o["name"].replace(".pack", "") for o in objects if o["name"].endswith(".pack") ] ret = [] for basename in pack_files: pack = SwiftPack(basename, object_format=self.object_format, scon=self.scon) self._pack_cache[basename] = pack ret.append(pack) return ret def _iter_loose_objects(self) -> Iterator[Any]: """Loose objects are not supported by this repository.""" return iter([]) def pack_info_get(self, sha: ObjectID) -> tuple[Any, ...] | None: """Get pack info for a specific SHA. Args: sha: The SHA to look up Returns: Pack info tuple or None if not found """ for pack in self.packs: if sha in pack: if hasattr(pack, "pack_info"): pack_info = pack.pack_info if pack_info is not None: return cast(tuple[Any, ...] | None, pack_info.get(sha)) return None def _collect_ancestors( self, heads: list[Any], common: set[Any] | None = None ) -> tuple[set[Any], set[Any]]: if common is None: common = set() def _find_parents(commit: ObjectID) -> list[Any]: for pack in self.packs: if commit in pack: try: if hasattr(pack, "pack_info"): pack_info = pack.pack_info if pack_info is not None: return cast(list[Any], pack_info[commit][1]) except KeyError: # Seems to have no parents return [] return [] bases = set() commits = set() queue = [] queue.extend(heads) while queue: e = queue.pop(0) if e in common: bases.add(e) elif e not in commits: commits.add(e) parents = _find_parents(e) queue.extend(parents) return (commits, bases) def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store. Returns: Fileobject to write to and a commit function to call when the pack is finished. """ f = BytesIO() def commit() -> "SwiftPack | None": """Commit the pack to Swift storage. 
Returns: The created SwiftPack or None if empty """ f.seek(0) from typing import cast from ..file import _GitFile pack = PackData( file=cast(_GitFile, f), filename="", object_format=self.object_format ) entries = pack.sorted_entries() if entries: basename = posixpath.join( self.pack_dir, f"pack-{iter_sha1(entry[0] for entry in entries).decode('ascii')}", ) index = BytesIO() write_pack_index_v2(index, entries, pack.get_stored_checksum()) self.scon.put_object(basename + ".pack", f) f.close() self.scon.put_object(basename + ".idx", index) index.close() final_pack = SwiftPack( basename, object_format=self.object_format, scon=self.scon ) final_pack.check_length_and_checksum() self._add_cached_pack(basename, final_pack) return final_pack else: return None def abort() -> None: """Abort the pack operation (no-op).""" def commit_wrapper() -> None: """Wrapper that discards the return value.""" commit() return f, commit_wrapper, abort def add_object(self, obj: object) -> None: """Add a single object to the store. Args: obj: The object to add """ self.add_objects( [ (obj, None), # type: ignore ] ) def _pack_cache_stale(self) -> bool: return False def _get_loose_object(self, sha: bytes) -> None: return None def add_thin_pack( self, read_all: Callable[[int], bytes], read_some: Callable[[int], bytes] ) -> "SwiftPack": """Read a thin pack. Read it from a stream and complete it in a temporary file. Then the pack and the corresponding index file are uploaded to Swift. """ fd, path = tempfile.mkstemp(prefix="tmp_pack_") f = os.fdopen(fd, "w+b") try: pack_data = PackData( file=cast(_GitFile, f), filename=path, object_format=self.object_format ) indexer = PackIndexer( cast(BinaryIO, pack_data._file), self.object_format.hash_func, resolve_ext_ref=None, ) copier = PackStreamCopier( self.object_format.hash_func, read_all, read_some, f, delta_iter=None ) copier.verify() return self._complete_thin_pack(f, path, copier, indexer) finally: f.close() os.unlink(path) def _complete_thin_pack( self, f: BinaryIO, path: str, copier: object, indexer: object ) -> "SwiftPack": entries = list(indexer) # type: ignore # Update the header with the new number of objects. f.seek(0) write_pack_header(f, len(entries) + len(indexer.ext_refs())) # type: ignore # Must flush before reading (http://bugs.python.org/issue3207) f.flush() # Rescan the rest of the pack, computing the SHA with the new header. new_sha = compute_file_sha( f, hash_func=self.object_format.hash_func, end_ofs=-self.object_format.oid_length, ) # Must reposition before writing (http://bugs.python.org/issue3207) f.seek(0, os.SEEK_CUR) # Complete the pack. for ext_sha in indexer.ext_refs(): # type: ignore assert len(ext_sha) in (20, 32) # SHA-1 or SHA-256 type_num, data = self.get_raw(ext_sha) offset = f.tell() crc32 = write_pack_object(f, type_num, data, sha=new_sha) # type: ignore entries.append((ext_sha, offset, crc32)) pack_sha = new_sha.digest() f.write(pack_sha) f.flush() # Move the pack in. entries.sort() pack_base_name = posixpath.join( self.pack_dir, "pack-" + os.fsdecode(iter_sha1(e[0] for e in entries)), ) self.scon.put_object(pack_base_name + ".pack", f) # Write the index. filename = pack_base_name + ".idx" index_file = BytesIO() write_pack_index_v2(index_file, entries, pack_sha) self.scon.put_object(filename, index_file) # Write pack info. 
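        # The ".info" object written below is a Swift-specific sidecar
        # produced by pack_info_create(). SwiftPack.pack_info loads it
        # lazily via load_pack_info(), and SwiftObjectStore.pack_info_get()
        # / _collect_ancestors() consult it so that per-object metadata
        # (such as commit parents) can be looked up without re-reading the
        # whole pack from Swift.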
f.seek(0) pack_data = PackData( filename="", file=cast(_GitFile, f), object_format=self.object_format ) index_file.seek(0) pack_index = load_pack_index_file("", index_file, self.object_format) serialized_pack_info = pack_info_create(pack_data, pack_index) f.close() index_file.close() pack_info_file = BytesIO(serialized_pack_info) filename = pack_base_name + ".info" self.scon.put_object(filename, pack_info_file) pack_info_file.close() # Add the pack to the store and return it. final_pack = SwiftPack( pack_base_name, object_format=self.object_format, scon=self.scon ) final_pack.check_length_and_checksum() self._add_cached_pack(pack_base_name, final_pack) return final_pack class SwiftInfoRefsContainer(RefsContainer): """Manage references in info/refs object.""" def __init__(self, scon: SwiftConnector, store: object) -> None: """Initialize SwiftInfoRefsContainer. Args: scon: Swift connector instance store: Object store instance """ self.scon = scon self.filename = "info/refs" self.store = store f = self.scon.get_object(self.filename) if not f: f = BytesIO(b"") elif isinstance(f, bytes): f = BytesIO(f) # Initialize refs from info/refs file self._refs: dict[Ref, ObjectID] = {} self._peeled: dict[Ref, ObjectID] = {} refs = read_info_refs(f) (self._refs, self._peeled) = split_peeled_refs(refs) def _load_check_ref( self, name: Ref, old_ref: ObjectID | None ) -> dict[Ref, ObjectID] | bool: self._check_refname(name) obj = self.scon.get_object(self.filename) if not obj: return {} if isinstance(obj, bytes): f = BytesIO(obj) else: f = obj refs = read_info_refs(f) (refs, _peeled) = split_peeled_refs(refs) if old_ref is not None: if refs[name] != old_ref: return False return refs def _write_refs(self, refs: Mapping[Ref, ObjectID]) -> None: f = BytesIO() f.writelines(write_info_refs(refs, cast("ObjectContainer", self.store))) self.scon.put_object(self.filename, f) def set_if_equals( self, name: Ref, old_ref: ObjectID | None, new_ref: ObjectID, committer: bytes | None = None, timestamp: float | None = None, timezone: int | None = None, message: bytes | None = None, ) -> bool: """Set a refname to new_ref only if it currently equals old_ref.""" if name == HEADREF: return True refs = self._load_check_ref(name, old_ref) if not isinstance(refs, dict): return False refs[name] = new_ref self._write_refs(refs) self._refs[name] = new_ref return True def remove_if_equals( self, name: Ref, old_ref: ObjectID | None, committer: object = None, timestamp: object = None, timezone: object = None, message: object = None, ) -> bool: """Remove a refname only if it currently equals old_ref.""" if name == HEADREF: return True refs = self._load_check_ref(name, old_ref) if not isinstance(refs, dict): return False del refs[name] self._write_refs(refs) del self._refs[name] return True def read_loose_ref(self, name: Ref) -> bytes | None: """Read a loose reference.""" return self._refs.get(name, None) def get_packed_refs(self) -> dict[Ref, ObjectID]: """Get packed references.""" return {} def get_peeled(self, name: Ref) -> ObjectID | None: """Get peeled version of a reference.""" try: return self._peeled[name] except KeyError: ref_value = self._refs.get(name) # Only return if it's an ObjectID (not a symref) if isinstance(ref_value, bytes) and len(ref_value) == 40: return ObjectID(ref_value) return None def allkeys(self) -> set[Ref]: """Get all reference names. 
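        Example (illustrative): a typical result is
        ``{b"HEAD", b"refs/heads/master"}``; ``HEAD`` is synthesized from
        ``refs/heads/master`` when that branch is present.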
Returns: Set of reference names as Ref """ try: self._refs[HEADREF] = self._refs[Ref(b"refs/heads/master")] except KeyError: pass return set(self._refs.keys()) class SwiftRepo(BaseRepo): """A Git repository backed by Swift object storage.""" def __init__(self, root: str, conf: ConfigParser) -> None: """Init a Git bare Repository on top of a Swift container. References are managed in info/refs objects by `SwiftInfoRefsContainer`. The root attribute is the Swift container that contain the Git bare repository. Args: root: The container which contains the bare repo conf: A ConfigParser object """ self.root = root.lstrip("/") self.conf = conf self.scon = SwiftConnector(self.root, self.conf) objects = self.scon.get_container_objects() if not objects: raise Exception(f"There is not any GIT repo here : {self.root}") object_names = [o["name"].split("/")[0] for o in objects] if OBJECTDIR not in object_names: raise Exception(f"This repository ({self.root}) is not bare.") self.bare = True self._controldir = self.root object_store = SwiftObjectStore(self.scon) refs = SwiftInfoRefsContainer(self.scon, object_store) BaseRepo.__init__(self, object_store, refs) def _determine_file_mode(self) -> bool: """Probe the file-system to determine whether permissions can be trusted. Returns: True if permissions can be trusted, False otherwise. """ return False def _put_named_file(self, filename: str, contents: bytes) -> None: """Put an object in a Swift container. Args: filename: the path to the object to put on Swift contents: the content as bytestring """ with BytesIO() as f: f.write(contents) self.scon.put_object(filename, f) @classmethod def init_bare(cls, scon: SwiftConnector, conf: ConfigParser) -> "SwiftRepo": """Create a new bare repository. Args: scon: a `SwiftConnector` instance conf: a ConfigParser object Returns: a `SwiftRepo` instance """ scon.create_root() for obj in [ posixpath.join(OBJECTDIR, PACKDIR), posixpath.join(INFODIR, "refs"), ]: scon.put_object(obj, BytesIO(b"")) ret = cls(scon.root, conf) ret._init_files(True) return ret class SwiftSystemBackend(Backend): """Backend for serving Git repositories from Swift.""" def __init__(self, logger: "logging.Logger", conf: ConfigParser) -> None: """Initialize SwiftSystemBackend. Args: logger: Logger instance conf: Configuration parser instance """ self.conf = conf self.logger = logger def open_repository(self, path: str) -> "BackendRepo": """Open a repository at the given path. Args: path: Path to the repository in Swift Returns: SwiftRepo instance """ self.logger.info("opening repository at %s", path) return cast("BackendRepo", SwiftRepo(path, self.conf)) def cmd_daemon(args: list[str]) -> None: """Start a TCP git server for Swift repositories. Args: args: Command line arguments """ import optparse parser = optparse.OptionParser() parser.add_option( "-l", "--listen_address", dest="listen_address", default="127.0.0.1", help="Binding IP address.", ) parser.add_option( "-p", "--port", dest="port", type=int, default=TCP_GIT_PORT, help="Binding TCP port.", ) parser.add_option( "-c", "--swift_config", dest="swift_config", default="", help="Path to the configuration file for Swift backend.", ) options, args = parser.parse_args(args) try: import gevent import geventhttpclient # noqa: F401 except ImportError: print( "gevent and geventhttpclient libraries are mandatory " " for use the Swift backend." 
) sys.exit(1) import gevent.monkey gevent.monkey.patch_socket() from dulwich import log_utils logger = log_utils.getLogger(__name__) conf = load_conf(options.swift_config) backend = SwiftSystemBackend(logger, conf) log_utils.default_logging_config() server = TCPGitServer(backend, options.listen_address, port=options.port) server.serve_forever() def cmd_init(args: list[str]) -> None: """Initialize a new Git repository in Swift. Args: args: Command line arguments """ import optparse parser = optparse.OptionParser() parser.add_option( "-c", "--swift_config", dest="swift_config", default="", help="Path to the configuration file for Swift backend.", ) options, args = parser.parse_args(args) conf = load_conf(options.swift_config) if args == []: parser.error("missing repository name") repo = args[0] scon = SwiftConnector(repo, conf) SwiftRepo.init_bare(scon, conf) def main(argv: list[str] = sys.argv) -> None: """Main entry point for Swift Git command line interface. Args: argv: Command line arguments """ commands = { "init": cmd_init, "daemon": cmd_daemon, } if len(argv) < 2: print("Usage: {} <{}> [OPTIONS...]".format(argv[0], "|".join(commands.keys()))) sys.exit(1) cmd = argv[1] if cmd not in commands: print(f"No such subcommand: {cmd}") sys.exit(1) commands[cmd](argv[2:]) if __name__ == "__main__": main() dulwich-1.0.0/dulwich/credentials.py000066400000000000000000000070011513301442600174520ustar00rootroot00000000000000# credentials.py -- support for git credential helpers # Copyright (C) 2022 Daniele Trifirò # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Support for git credential helpers. https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage """ __all__ = [ "match_partial_url", "match_urls", "urlmatch_credential_sections", ] import sys from collections.abc import Iterator from urllib.parse import ParseResult, urlparse from .config import ConfigDict, SectionLike def match_urls(url: ParseResult, url_prefix: ParseResult) -> bool: """Check if a URL matches a URL prefix. 
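    Example (illustrative sketch; the URLs are placeholders)::

        from urllib.parse import urlparse

        match_urls(urlparse("https://example.com/org/repo.git"),
                   urlparse("https://example.com"))        # True
        match_urls(urlparse("https://example.com/org/repo.git"),
                   urlparse("https://other.example.com"))  # False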
Args: url: Parsed URL to check url_prefix: Parsed URL prefix to match against Returns: True if url matches the prefix """ base_match = ( url.scheme == url_prefix.scheme and url.hostname == url_prefix.hostname and url.port == url_prefix.port ) user_match = url.username == url_prefix.username if url_prefix.username else True path_match = url.path.rstrip("/").startswith(url_prefix.path.rstrip()) return base_match and user_match and path_match def match_partial_url(valid_url: ParseResult, partial_url: str) -> bool: """Matches a parsed url with a partial url (no scheme/netloc).""" if "://" not in partial_url: parsed = urlparse("scheme://" + partial_url) else: parsed = urlparse(partial_url) if valid_url.scheme != parsed.scheme: return False if any( ( (parsed.hostname and valid_url.hostname != parsed.hostname), (parsed.username and valid_url.username != parsed.username), (parsed.port and valid_url.port != parsed.port), (parsed.path and parsed.path.rstrip("/") != valid_url.path.rstrip("/")), ), ): return False return True def urlmatch_credential_sections( config: ConfigDict, url: str | None ) -> Iterator[SectionLike]: """Returns credential sections from the config which match the given URL.""" encoding = config.encoding or sys.getdefaultencoding() parsed_url = urlparse(url or "") for config_section in config.sections(): if config_section[0] != b"credential": continue if len(config_section) < 2: yield config_section continue config_url = config_section[1].decode(encoding) parsed_config_url = urlparse(config_url) if parsed_config_url.scheme and parsed_config_url.netloc: is_match = match_urls(parsed_url, parsed_config_url) else: is_match = match_partial_url(parsed_url, config_url) if is_match: yield config_section dulwich-1.0.0/dulwich/diff.py000066400000000000000000000642431513301442600161000ustar00rootroot00000000000000# diff.py -- Diff functionality for Dulwich # Copyright (C) 2025 Dulwich contributors # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Diff functionality with separate codepaths. This module provides three main functions for different diff scenarios: 1. diff_index_to_tree: Shows staged changes (index vs commit) Used by: git diff --staged, git diff --cached 2. diff_working_tree_to_tree: Shows all changes from a commit to working tree Used by: git diff 3. 
diff_working_tree_to_index: Shows unstaged changes (working tree vs index) Used by: git diff (with no arguments) Example usage: from dulwich.repo import Repo from dulwich.diff import diff_index_to_tree import sys repo = Repo('.') # Show staged changes diff_index_to_tree(repo, sys.stdout.buffer) # Show changes in specific paths only diff_index_to_tree(repo, sys.stdout.buffer, paths=[b'src/', b'README.md']) """ __all__ = [ "ColorizedDiffStream", "diff_index_to_tree", "diff_working_tree_to_index", "diff_working_tree_to_tree", "should_include_path", ] import io import logging import os import stat from collections.abc import Iterable, Sequence from typing import BinaryIO from ._typing import Buffer from .index import ConflictedIndexEntry, commit_index from .object_store import iter_tree_contents from .objects import S_ISGITLINK, Blob, Commit, ObjectID from .patch import write_blob_diff, write_object_diff from .repo import Repo logger = logging.getLogger(__name__) def should_include_path(path: bytes, paths: Sequence[bytes] | None) -> bool: """Check if a path should be included based on path filters. Args: path: The path to check paths: List of path filters, or None for no filtering Returns: True if the path should be included """ if not paths: return True return any(path == p or path.startswith(p + b"/") for p in paths) def diff_index_to_tree( repo: Repo, outstream: BinaryIO, commit_sha: ObjectID | None = None, paths: Sequence[bytes] | None = None, diff_algorithm: str | None = None, ) -> None: """Show staged changes (index vs commit). Args: repo: Repository object outstream: Stream to write diff to commit_sha: SHA of commit to compare against, or None for HEAD paths: Optional list of paths to filter (as bytes) diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None """ if commit_sha is None: try: from dulwich.refs import HEADREF commit_sha = repo.refs[HEADREF] old_commit = repo[commit_sha] assert isinstance(old_commit, Commit) old_tree = old_commit.tree except KeyError: # No HEAD means no commits yet old_tree = None else: old_commit = repo[commit_sha] assert isinstance(old_commit, Commit) old_tree = old_commit.tree # Get tree from index index = repo.open_index() new_tree = commit_index(repo.object_store, index) changes = repo.object_store.tree_changes(old_tree, new_tree, paths=paths) for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in changes: write_object_diff( outstream, repo.object_store, (oldpath, oldmode, oldsha), (newpath, newmode, newsha), diff_algorithm=diff_algorithm, ) def diff_working_tree_to_tree( repo: Repo, outstream: BinaryIO, commit_sha: ObjectID, paths: Sequence[bytes] | None = None, diff_algorithm: str | None = None, ) -> None: """Compare working tree to a specific commit. 
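    Example (illustrative sketch, following the module-level usage example)::

        import sys

        from dulwich.refs import HEADREF
        from dulwich.repo import Repo

        repo = Repo(".")
        diff_working_tree_to_tree(repo, sys.stdout.buffer, repo.refs[HEADREF])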
Args: repo: Repository object outstream: Stream to write diff to commit_sha: SHA of commit to compare against paths: Optional list of paths to filter (as bytes) diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None """ commit = repo[commit_sha] assert isinstance(commit, Commit) tree = commit.tree normalizer = repo.get_blob_normalizer() filter_callback = normalizer.checkin_normalize if normalizer is not None else None # Get index for tracking new files index = repo.open_index() index_paths = set(index.paths()) processed_paths = set() # Process files from the committed tree lazily for entry in iter_tree_contents(repo.object_store, tree): assert ( entry.path is not None and entry.mode is not None and entry.sha is not None ) path = entry.path if not should_include_path(path, paths): continue processed_paths.add(path) full_path = os.path.join(repo.path, path.decode("utf-8")) # Get the old file from tree old_mode = entry.mode old_sha = entry.sha old_blob = repo.object_store[old_sha] assert isinstance(old_blob, Blob) try: # Use lstat to handle symlinks properly st = os.lstat(full_path) except FileNotFoundError: # File was deleted if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) except PermissionError: logger.warning("%s: Permission denied", path.decode()) # Show as deletion if it was in tree if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) except OSError as e: logger.warning("%s: %s", path.decode(), e) # Show as deletion if it was in tree if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) else: # Handle different file types if stat.S_ISDIR(st.st_mode): if old_blob is not None: # Directory in working tree where file was expected if stat.S_ISLNK(old_mode): logger.warning("%s: symlink became a directory", path.decode()) else: logger.warning("%s: file became a directory", path.decode()) # Show as deletion write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) # If old_blob is None, it's a new directory - skip it continue elif stat.S_ISLNK(st.st_mode): # Symlink in working tree target = os.readlink(full_path).encode("utf-8") new_blob = Blob() new_blob.data = target if old_blob is None: # New symlink write_blob_diff( outstream, (None, None, None), (path, stat.S_IFLNK | 0o777, new_blob), ) elif not stat.S_ISLNK(old_mode): # Type change: file/submodule -> symlink write_blob_diff( outstream, (path, old_mode, old_blob), (path, stat.S_IFLNK | 0o777, new_blob), ) elif old_blob is not None and old_blob.data != target: # Symlink target changed write_blob_diff( outstream, (path, old_mode, old_blob), (path, old_mode, new_blob), ) elif stat.S_ISREG(st.st_mode): # Regular file with open(full_path, "rb") as f: new_content = f.read() # Create a temporary blob for filtering and comparison new_blob = Blob() new_blob.data = new_content # Apply filters if needed (only for regular files, not gitlinks) if filter_callback is not None and ( old_blob is None or not S_ISGITLINK(old_mode) ): new_blob = filter_callback(new_blob, path) # Determine the git mode for the new file if st.st_mode & stat.S_IXUSR: new_git_mode = stat.S_IFREG | 0o755 else: new_git_mode = stat.S_IFREG | 0o644 if old_blob is None: # New file write_blob_diff( outstream, (None, None, None), (path, new_git_mode, new_blob) ) elif stat.S_ISLNK(old_mode): # Symlink -> file write_blob_diff( outstream, (path, old_mode, old_blob), 
(path, new_git_mode, new_blob), ) elif S_ISGITLINK(old_mode): # Submodule -> file write_blob_diff( outstream, (path, old_mode, old_blob), (path, new_git_mode, new_blob), ) else: # Regular file, check for content or mode changes old_git_mode = old_mode & (stat.S_IFREG | 0o777) if ( old_blob is not None and old_blob.data != new_blob.data ) or old_git_mode != new_git_mode: write_blob_diff( outstream, (path, old_mode, old_blob), (path, new_git_mode, new_blob), ) elif stat.S_ISFIFO(st.st_mode): logger.warning("%s: unsupported file type (fifo)", path.decode()) if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) elif stat.S_ISSOCK(st.st_mode): logger.warning("%s: unsupported file type (socket)", path.decode()) if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) else: logger.warning("%s: unsupported file type", path.decode()) if old_blob is not None: write_blob_diff( outstream, (path, old_mode, old_blob), (None, None, None) ) # Now process any new files from index that weren't in the tree for path in sorted(index_paths - processed_paths): if not should_include_path(path, paths): continue full_path = os.path.join(repo.path, path.decode("utf-8")) try: # Use lstat to handle symlinks properly st = os.lstat(full_path) except FileNotFoundError: # New file already deleted, skip continue except PermissionError: logger.warning("%s: Permission denied", path.decode()) continue except OSError as e: logger.warning("%s: %s", path.decode(), e) continue # Handle different file types for new files if stat.S_ISDIR(st.st_mode): # New directory - skip it continue elif stat.S_ISLNK(st.st_mode): # New symlink target = os.readlink(full_path).encode("utf-8") new_blob = Blob() new_blob.data = target write_blob_diff( outstream, (None, None, None), (path, stat.S_IFLNK | 0o777, new_blob), ) elif stat.S_ISREG(st.st_mode): # New regular file with open(full_path, "rb") as f: new_content = f.read() new_blob = Blob() new_blob.data = new_content # Apply filters if needed if filter_callback is not None: new_blob = filter_callback(new_blob, path) # Determine the git mode for the new file if st.st_mode & stat.S_IXUSR: new_git_mode = 0o100755 else: new_git_mode = 0o100644 write_blob_diff( outstream, (None, None, None), (path, new_git_mode, new_blob) ) elif stat.S_ISFIFO(st.st_mode): logger.warning("%s: unsupported file type (fifo)", path.decode()) elif stat.S_ISSOCK(st.st_mode): logger.warning("%s: unsupported file type (socket)", path.decode()) else: logger.warning("%s: unsupported file type", path.decode()) def diff_working_tree_to_index( repo: Repo, outstream: BinaryIO, paths: Sequence[bytes] | None = None, diff_algorithm: str | None = None, ) -> None: """Compare working tree to index. 
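    Example (illustrative sketch; the path filter is a placeholder)::

        import sys

        from dulwich.repo import Repo

        # Show unstaged changes, optionally limited to certain paths.
        diff_working_tree_to_index(Repo("."), sys.stdout.buffer, paths=[b"src/"])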
Args: repo: Repository object outstream: Stream to write diff to paths: Optional list of paths to filter (as bytes) diff_algorithm: Algorithm to use for diffing ("myers" or "patience"), defaults to DEFAULT_DIFF_ALGORITHM if None """ index = repo.open_index() normalizer = repo.get_blob_normalizer() filter_callback = normalizer.checkin_normalize if normalizer is not None else None # Process each file in the index for tree_path, entry in index.iteritems(): if not should_include_path(tree_path, paths): continue # Handle conflicted entries by using stage 2 ("ours") if isinstance(entry, ConflictedIndexEntry): if entry.this is None: continue # No stage 2 entry, skip old_mode = entry.this.mode old_sha = entry.this.sha else: # Get file from regular index entry old_mode = entry.mode old_sha = entry.sha old_obj = repo.object_store[old_sha] # Type check and cast to Blob if isinstance(old_obj, Blob): old_blob = old_obj else: old_blob = None full_path = os.path.join(repo.path, tree_path.decode("utf-8")) try: # Use lstat to handle symlinks properly st = os.lstat(full_path) # Handle different file types if stat.S_ISDIR(st.st_mode): # Directory in working tree where file was expected if stat.S_ISLNK(old_mode): logger.warning("%s: symlink became a directory", tree_path.decode()) else: logger.warning("%s: file became a directory", tree_path.decode()) # Show as deletion write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) elif stat.S_ISLNK(st.st_mode): # Symlink in working tree target = os.readlink(full_path).encode("utf-8") new_blob = Blob() new_blob.data = target # Check if type changed or content changed if not stat.S_ISLNK(old_mode): # Type change: file/submodule -> symlink write_blob_diff( outstream, (tree_path, old_mode, old_blob), (tree_path, stat.S_IFLNK | 0o777, new_blob), ) elif old_blob is not None and old_blob.data != target: # Symlink target changed write_blob_diff( outstream, (tree_path, old_mode, old_blob), (tree_path, old_mode, new_blob), ) elif stat.S_ISREG(st.st_mode): # Regular file with open(full_path, "rb") as f: new_content = f.read() # Create a temporary blob for filtering and comparison new_blob = Blob() new_blob.data = new_content # Apply filters if needed (only for regular files) if filter_callback is not None and not S_ISGITLINK(old_mode): new_blob = filter_callback(new_blob, tree_path) # Determine the git mode for the new file if st.st_mode & stat.S_IXUSR: new_git_mode = stat.S_IFREG | 0o755 else: new_git_mode = stat.S_IFREG | 0o644 # Check if this was a type change if stat.S_ISLNK(old_mode): # Symlink -> file write_blob_diff( outstream, (tree_path, old_mode, old_blob), (tree_path, new_git_mode, new_blob), ) elif S_ISGITLINK(old_mode): # Submodule -> file write_blob_diff( outstream, (tree_path, old_mode, old_blob), (tree_path, new_git_mode, new_blob), ) else: # Regular file, check for content or mode changes old_git_mode = old_mode & (stat.S_IFREG | 0o777) if ( old_blob is not None and old_blob.data != new_blob.data ) or old_git_mode != new_git_mode: write_blob_diff( outstream, (tree_path, old_mode, old_blob), (tree_path, new_git_mode, new_blob), ) elif stat.S_ISFIFO(st.st_mode): logger.warning("%s: unsupported file type (fifo)", tree_path.decode()) write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) elif stat.S_ISSOCK(st.st_mode): logger.warning("%s: unsupported file type (socket)", tree_path.decode()) write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) else: logger.warning("%s: unsupported 
file type", tree_path.decode()) write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) except FileNotFoundError: # File was deleted - this is normal, not a warning write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) except PermissionError: logger.warning("%s: Permission denied", tree_path.decode()) # Show as deletion since we can't read it write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) except OSError as e: logger.warning("%s: %s", tree_path.decode(), e) # Show as deletion since we can't read it write_blob_diff( outstream, (tree_path, old_mode, old_blob), (None, None, None) ) class ColorizedDiffStream(BinaryIO): """Stream wrapper that colorizes diff output line by line using Rich. This class wraps a binary output stream and applies color formatting to diff output as it's written. It processes data line by line to enable streaming colorization without buffering the entire diff. """ @staticmethod def is_available() -> bool: """Check if Rich is available for colorization. Returns: bool: True if Rich can be imported, False otherwise """ try: import importlib.util return importlib.util.find_spec("rich.console") is not None except ImportError: return False def __init__(self, output_stream: BinaryIO) -> None: """Initialize the colorized stream wrapper. Args: output_stream: The underlying binary stream to write to """ self.output_stream = output_stream import io from rich.console import Console # Rich expects a text stream, so we need to wrap our binary stream self.text_wrapper = io.TextIOWrapper( output_stream, encoding="utf-8", newline="" ) self.console = Console(file=self.text_wrapper, force_terminal=True) self.buffer = b"" def write(self, data: bytes | Buffer) -> int: # type: ignore[override,unused-ignore] """Write data to the stream, applying colorization. Args: data: Bytes to write Returns: Number of bytes written """ # Add new data to buffer if not isinstance(data, bytes): data = bytes(data) self.buffer += data # Process complete lines while b"\n" in self.buffer: line, self.buffer = self.buffer.split(b"\n", 1) self._colorize_and_write_line(line + b"\n") return len(data) def writelines(self, lines: Iterable[bytes | Buffer]) -> None: # type: ignore[override,unused-ignore] """Write a list of lines to the stream. Args: lines: Iterable of bytes to write """ for line in lines: self.write(line) def _colorize_and_write_line(self, line_bytes: bytes) -> None: """Apply color formatting to a single line and write it. 
Args: line_bytes: The line to colorize and write (as bytes) """ try: line = line_bytes.decode("utf-8", errors="replace") # Colorize based on diff line type if line.startswith("+") and not line.startswith("+++"): self.console.print(line, style="green", end="") elif line.startswith("-") and not line.startswith("---"): self.console.print(line, style="red", end="") elif line.startswith("@@"): self.console.print(line, style="cyan", end="") elif line.startswith(("+++", "---")): self.console.print(line, style="bold", end="") else: self.console.print(line, end="") except (UnicodeDecodeError, UnicodeEncodeError): # Fallback to raw output if we can't decode/encode the text self.output_stream.write(line_bytes) def flush(self) -> None: """Flush any remaining buffered content and the underlying stream.""" # Write any remaining buffer content if self.buffer: self._colorize_and_write_line(self.buffer) self.buffer = b"" # Flush the text wrapper and underlying stream if hasattr(self.text_wrapper, "flush"): self.text_wrapper.flush() if hasattr(self.output_stream, "flush"): self.output_stream.flush() # BinaryIO interface methods def close(self) -> None: """Close the stream.""" self.flush() if hasattr(self.output_stream, "close"): self.output_stream.close() @property def closed(self) -> bool: """Check if the stream is closed.""" return getattr(self.output_stream, "closed", False) def fileno(self) -> int: """Return the file descriptor.""" return self.output_stream.fileno() def isatty(self) -> bool: """Check if the stream is a TTY.""" return getattr(self.output_stream, "isatty", lambda: False)() def read(self, n: int = -1) -> bytes: """Read is not supported on this write-only stream.""" raise io.UnsupportedOperation("not readable") def readable(self) -> bool: """This stream is not readable.""" return False def readline(self, limit: int = -1) -> bytes: """Read is not supported on this write-only stream.""" raise io.UnsupportedOperation("not readable") def readlines(self, hint: int = -1) -> list[bytes]: """Read is not supported on this write-only stream.""" raise io.UnsupportedOperation("not readable") def seek(self, offset: int, whence: int = 0) -> int: """Seek is not supported on this stream.""" raise io.UnsupportedOperation("not seekable") def seekable(self) -> bool: """This stream is not seekable.""" return False def tell(self) -> int: """Tell is not supported on this stream.""" raise io.UnsupportedOperation("not seekable") def truncate(self, size: int | None = None) -> int: """Truncate is not supported on this stream.""" raise io.UnsupportedOperation("not truncatable") def writable(self) -> bool: """This stream is writable.""" return True def __enter__(self) -> "ColorizedDiffStream": """Context manager entry.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: object | None, ) -> None: """Context manager exit.""" self.flush() def __iter__(self) -> "ColorizedDiffStream": """Iterator interface - not supported.""" raise io.UnsupportedOperation("not iterable") def __next__(self) -> bytes: """Iterator interface - not supported.""" raise io.UnsupportedOperation("not iterable") dulwich-1.0.0/dulwich/diff_tree.py000066400000000000000000000741371513301442600171220ustar00rootroot00000000000000# diff_tree.py -- Utilities for diffing files and trees. # Copyright (C) 2010 Google, Inc. 
# # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Utilities for diffing files and trees.""" __all__ = [ "CHANGE_ADD", "CHANGE_COPY", "CHANGE_DELETE", "CHANGE_MODIFY", "CHANGE_RENAME", "CHANGE_UNCHANGED", "MAX_FILES", "RENAME_CHANGE_TYPES", "RENAME_THRESHOLD", "REWRITE_THRESHOLD", "RenameDetector", "TreeChange", "tree_changes", "tree_changes_for_merge", "walk_trees", ] import stat from collections import defaultdict from collections.abc import Callable, Iterator, Mapping, Sequence from collections.abc import Set as AbstractSet from io import BytesIO from itertools import chain from typing import TYPE_CHECKING, Any, NamedTuple, TypeVar from .object_store import BaseObjectStore from .objects import S_ISGITLINK, ObjectID, ShaFile, Tree, TreeEntry # TreeChange type constants. CHANGE_ADD = "add" CHANGE_MODIFY = "modify" CHANGE_DELETE = "delete" CHANGE_RENAME = "rename" CHANGE_COPY = "copy" CHANGE_UNCHANGED = "unchanged" RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY) # _NULL_ENTRY removed - using None instead _MAX_SCORE = 100 RENAME_THRESHOLD = 60 MAX_FILES = 200 REWRITE_THRESHOLD: int | None = None class TreeChange(NamedTuple): """Named tuple a single change between two trees.""" type: str old: TreeEntry | None new: TreeEntry | None @classmethod def add(cls, new: TreeEntry) -> "TreeChange": """Create a TreeChange for an added entry. Args: new: New tree entry Returns: TreeChange instance """ return cls(CHANGE_ADD, None, new) @classmethod def delete(cls, old: TreeEntry) -> "TreeChange": """Create a TreeChange for a deleted entry. Args: old: Old tree entry Returns: TreeChange instance """ return cls(CHANGE_DELETE, old, None) def _tree_entries(path: bytes, tree: Tree) -> list[TreeEntry]: result: list[TreeEntry] = [] if not tree: return result for entry in tree.iteritems(name_order=True): result.append(entry.in_path(path)) return result def _merge_entries( path: bytes, tree1: Tree, tree2: Tree ) -> list[tuple[TreeEntry | None, TreeEntry | None]]: """Merge the entries of two trees. Args: path: A path to prepend to all tree entry names. tree1: The first Tree object to iterate, or None. tree2: The second Tree object to iterate, or None. Returns: A list of pairs of TreeEntry objects for each pair of entries in the trees. If an entry exists in one tree but not the other, the other entry will be None. If both entries exist, they are guaranteed to match. 
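    Example (illustrative): if ``tree1`` contains entries named ``a`` and
    ``b`` while ``tree2`` contains ``b`` and ``c``, the result is
    ``[(a, None), (b, b), (None, c)]``, pairing entries by path in name
    order.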
""" entries1 = _tree_entries(path, tree1) entries2 = _tree_entries(path, tree2) i1 = i2 = 0 len1 = len(entries1) len2 = len(entries2) result: list[tuple[TreeEntry | None, TreeEntry | None]] = [] while i1 < len1 and i2 < len2: entry1 = entries1[i1] entry2 = entries2[i2] if entry1.path < entry2.path: result.append((entry1, None)) i1 += 1 elif entry1.path > entry2.path: result.append((None, entry2)) i2 += 1 else: result.append((entry1, entry2)) i1 += 1 i2 += 1 for i in range(i1, len1): result.append((entries1[i], None)) for i in range(i2, len2): result.append((None, entries2[i])) return result def _is_tree(entry: TreeEntry | None) -> bool: if entry is None or entry.mode is None: return False return stat.S_ISDIR(entry.mode) def walk_trees( store: BaseObjectStore, tree1_id: ObjectID | None, tree2_id: ObjectID | None, prune_identical: bool = False, paths: Sequence[bytes] | None = None, ) -> Iterator[tuple[TreeEntry | None, TreeEntry | None]]: """Recursively walk all the entries of two trees. Iteration is depth-first pre-order, as in e.g. os.walk. Args: store: An ObjectStore for looking up objects. tree1_id: The SHA of the first Tree object to iterate, or None. tree2_id: The SHA of the second Tree object to iterate, or None. prune_identical: If True, identical subtrees will not be walked. paths: Optional list of paths to filter to (as bytes). Returns: Iterator over Pairs of TreeEntry objects for each pair of entries in the trees and their subtrees recursively. If an entry exists in one tree but not the other, the other entry will be None. If both entries exist, they are guaranteed to match. """ # This could be fairly easily generalized to >2 trees if we find a use # case. entry1 = TreeEntry(b"", stat.S_IFDIR, tree1_id) if tree1_id else None entry2 = TreeEntry(b"", stat.S_IFDIR, tree2_id) if tree2_id else None todo: list[tuple[TreeEntry | None, TreeEntry | None]] = [(entry1, entry2)] while todo: entry1, entry2 = todo.pop() is_tree1 = _is_tree(entry1) is_tree2 = _is_tree(entry2) if prune_identical and is_tree1 and is_tree2 and entry1 == entry2: continue tree1 = (is_tree1 and entry1 and store[entry1.sha]) or None tree2 = (is_tree2 and entry2 and store[entry2.sha]) or None path = ( (entry1.path if entry1 else None) or (entry2.path if entry2 else None) or b"" ) # If we have path filters, check if we should process this tree if paths is not None and (is_tree1 or is_tree2) and path is not None: # Special case for root tree if path == b"": should_recurse = True else: # Check if any of our filter paths could be under this tree should_recurse = False for filter_path in paths: if filter_path == path: # Exact match - we want this directory itself should_recurse = True break elif filter_path.startswith(path + b"/"): # Filter path is under this directory should_recurse = True break elif path.startswith(filter_path + b"/"): # This directory is under a filter path should_recurse = True break if not should_recurse: # Skip this tree entirely continue # Ensure trees are Tree objects before merging if tree1 is not None and not isinstance(tree1, Tree): tree1 = None if tree2 is not None and not isinstance(tree2, Tree): tree2 = None if tree1 is not None or tree2 is not None: # Use empty trees for None values if tree1 is None: tree1 = Tree() if tree2 is None: tree2 = Tree() assert path is not None todo.extend(reversed(_merge_entries(path, tree1, tree2))) # Only yield entries that match our path filters if paths is None: yield entry1, entry2 else: # Check if this entry matches any of our filters for filter_path in paths: 
if path == filter_path: # Exact match yield entry1, entry2 break elif path is not None and path.startswith(filter_path + b"/"): # This entry is under a filter directory yield entry1, entry2 break elif ( path is not None and filter_path.startswith(path + b"/") and (is_tree1 or is_tree2) ): # This is a parent directory of a filter path yield entry1, entry2 break def _skip_tree(entry: TreeEntry | None, include_trees: bool) -> TreeEntry | None: if entry is None or entry.mode is None: return None if not include_trees and stat.S_ISDIR(entry.mode): return None return entry def tree_changes( store: BaseObjectStore, tree1_id: ObjectID | None, tree2_id: ObjectID | None, want_unchanged: bool = False, rename_detector: "RenameDetector | None" = None, include_trees: bool = False, change_type_same: bool = False, paths: Sequence[bytes] | None = None, ) -> Iterator[TreeChange]: """Find the differences between the contents of two trees. Args: store: An ObjectStore for looking up objects. tree1_id: The SHA of the source tree. tree2_id: The SHA of the target tree. want_unchanged: If True, include TreeChanges for unmodified entries as well. include_trees: Whether to include trees rename_detector: RenameDetector object for detecting renames. change_type_same: Whether to report change types in the same entry or as delete+add. paths: Optional list of paths to filter to (as bytes). Returns: Iterator over TreeChange instances for each change between the source and target tree. """ if rename_detector is not None and tree1_id is not None and tree2_id is not None: yield from rename_detector.changes_with_renames( tree1_id, tree2_id, want_unchanged=want_unchanged, include_trees=include_trees, ) return entries = walk_trees( store, tree1_id, tree2_id, prune_identical=(not want_unchanged), paths=paths ) for entry1, entry2 in entries: if entry1 == entry2 and not want_unchanged: continue # Treat entries for trees as missing. entry1 = _skip_tree(entry1, include_trees) entry2 = _skip_tree(entry2, include_trees) if entry1 is not None and entry2 is not None: if ( entry1.mode is not None and entry2.mode is not None and stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode) and not change_type_same ): # File type changed: report as delete/add. yield TreeChange.delete(entry1) entry1 = None change_type = CHANGE_ADD elif entry1 == entry2: change_type = CHANGE_UNCHANGED else: change_type = CHANGE_MODIFY elif entry1 is not None: change_type = CHANGE_DELETE elif entry2 is not None: change_type = CHANGE_ADD else: # Both were None because at least one was a tree. continue yield TreeChange(change_type, entry1, entry2) T = TypeVar("T") U = TypeVar("U") def _all_eq(seq: Sequence[T], key: Callable[[T], U], value: U) -> bool: for e in seq: if key(e) != value: return False return True def _all_same(seq: Sequence[Any], key: Callable[[Any], Any]) -> bool: return _all_eq(seq[1:], key, key(seq[0])) def tree_changes_for_merge( store: BaseObjectStore, parent_tree_ids: Sequence[ObjectID], tree_id: ObjectID, rename_detector: "RenameDetector | None" = None, ) -> Iterator[list[TreeChange | None]]: """Get the tree changes for a merge tree relative to all its parents. Args: store: An ObjectStore for looking up objects. parent_tree_ids: An iterable of the SHAs of the parent trees. tree_id: The SHA of the merge tree. rename_detector: RenameDetector object for detecting renames. Returns: Iterator over lists of TreeChange objects, one per conflicted path in the merge. 
Each list contains one element per parent, with the TreeChange for that path relative to that parent. An element may be None if it never existed in one parent and was deleted in two others. A path is only included in the output if it is a conflict, i.e. its SHA in the merge tree is not found in any of the parents, or in the case of deletes, if not all of the old SHAs match. """ all_parent_changes = [ tree_changes(store, t, tree_id, rename_detector=rename_detector) for t in parent_tree_ids ] num_parents = len(parent_tree_ids) changes_by_path: dict[bytes, list[TreeChange | None]] = defaultdict( lambda: [None] * num_parents ) # Organize by path. for i, parent_changes in enumerate(all_parent_changes): for change in parent_changes: if change.type == CHANGE_DELETE: assert change.old is not None path = change.old.path else: assert change.new is not None path = change.new.path assert path is not None changes_by_path[path][i] = change def old_sha(c: TreeChange) -> ObjectID | None: return c.old.sha if c.old is not None else None def change_type(c: TreeChange) -> str: return c.type # Yield only conflicting changes. for _, changes in sorted(changes_by_path.items()): assert len(changes) == num_parents have = [c for c in changes if c is not None] if _all_eq(have, change_type, CHANGE_DELETE): if not _all_same(have, old_sha): yield changes elif not _all_same(have, change_type): yield changes elif None not in changes: # If no change was found relative to one parent, that means the SHA # must have matched the SHA in that parent, so it is not a # conflict. yield changes _BLOCK_SIZE = 64 def _count_blocks(obj: ShaFile) -> dict[int, int]: """Count the blocks in an object. Splits the data into blocks either on lines or <=64-byte chunks of lines. Args: obj: The object to count blocks for. Returns: A dict of block hashcode -> total bytes occurring. """ block_counts: dict[int, int] = defaultdict(int) block = BytesIO() n = 0 # Cache attrs as locals to avoid expensive lookups in the inner loop. block_write = block.write block_seek = block.seek block_truncate = block.truncate block_getvalue = block.getvalue for c in chain.from_iterable(obj.as_raw_chunks()): cb = c.to_bytes(1, "big") block_write(cb) n += 1 if cb == b"\n" or n == _BLOCK_SIZE: value = block_getvalue() block_counts[hash(value)] += len(value) block_seek(0) block_truncate() n = 0 if n > 0: last_block = block_getvalue() block_counts[hash(last_block)] += len(last_block) return block_counts def _common_bytes(blocks1: Mapping[int, int], blocks2: Mapping[int, int]) -> int: """Count the number of common bytes in two block count dicts. Args: blocks1: The first dict of block hashcode -> total bytes. blocks2: The second dict of block hashcode -> total bytes. Returns: The number of bytes in common between blocks1 and blocks2. This is only approximate due to possible hash collisions. """ # Iterate over the smaller of the two dicts, since this is symmetrical. if len(blocks1) > len(blocks2): blocks1, blocks2 = blocks2, blocks1 score = 0 for block, count1 in blocks1.items(): count2 = blocks2.get(block) if count2: score += min(count1, count2) return score def _similarity_score( obj1: ShaFile, obj2: ShaFile, block_cache: dict[ObjectID, dict[int, int]] | None = None, ) -> int: """Compute a similarity score for two objects. Args: obj1: The first object to score. obj2: The second object to score. block_cache: An optional dict of SHA to block counts to cache results between calls. 
Returns: The similarity score between the two objects, defined as the number of bytes in common between the two objects divided by the maximum size, scaled to the range 0-100. """ if block_cache is None: block_cache = {} if obj1.id not in block_cache: block_cache[obj1.id] = _count_blocks(obj1) if obj2.id not in block_cache: block_cache[obj2.id] = _count_blocks(obj2) common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id]) max_size = max(obj1.raw_length(), obj2.raw_length()) if not max_size: return _MAX_SCORE return int(float(common_bytes) * _MAX_SCORE / max_size) def _tree_change_key(entry: TreeChange) -> tuple[bytes, bytes]: # Sort by old path then new path. If only one exists, use it for both keys. path1 = entry.old.path if entry.old is not None else None path2 = entry.new.path if entry.new is not None else None if path1 is None: path1 = path2 if path2 is None: path2 = path1 assert path1 is not None assert path2 is not None return (path1, path2) class RenameDetector: """Object for handling rename detection between two trees.""" _adds: list[TreeChange] _deletes: list[TreeChange] _changes: list[TreeChange] _candidates: list[tuple[int, TreeChange]] def __init__( self, store: BaseObjectStore, rename_threshold: int = RENAME_THRESHOLD, max_files: int | None = MAX_FILES, rewrite_threshold: int | None = REWRITE_THRESHOLD, find_copies_harder: bool = False, ) -> None: """Initialize the rename detector. Args: store: An ObjectStore for looking up objects. rename_threshold: The threshold similarity score for considering an add/delete pair to be a rename/copy; see _similarity_score. max_files: The maximum number of adds and deletes to consider, or None for no limit. The detector is guaranteed to compare no more than max_files ** 2 add/delete pairs. This limit is provided because rename detection can be quadratic in the project size. If the limit is exceeded, no content rename detection is attempted. rewrite_threshold: The threshold similarity score below which a modify should be considered a delete/add, or None to not break modifies; see _similarity_score. find_copies_harder: If True, consider unmodified files when detecting copies. """ self._store = store self._rename_threshold = rename_threshold self._rewrite_threshold = rewrite_threshold self._max_files = max_files self._find_copies_harder = find_copies_harder self._want_unchanged = False def _reset(self) -> None: self._adds = [] self._deletes = [] self._changes = [] def _should_split(self, change: TreeChange) -> bool: if self._rewrite_threshold is None or change.type != CHANGE_MODIFY: return False assert change.old is not None and change.new is not None if change.old.sha == change.new.sha: return False assert change.old.sha is not None assert change.new.sha is not None old_obj = self._store[change.old.sha] new_obj = self._store[change.new.sha] return _similarity_score(old_obj, new_obj) < self._rewrite_threshold def _add_change(self, change: TreeChange) -> None: if change.type == CHANGE_ADD: self._adds.append(change) elif change.type == CHANGE_DELETE: self._deletes.append(change) elif self._should_split(change): assert change.old is not None and change.new is not None self._deletes.append(TreeChange.delete(change.old)) self._adds.append(TreeChange.add(change.new)) elif ( self._find_copies_harder and change.type == CHANGE_UNCHANGED ) or change.type == CHANGE_MODIFY: # Treat all modifies as potential deletes for rename detection, # but don't split them (to avoid spurious renames). 
Setting # find_copies_harder means we treat unchanged the same as # modified. self._deletes.append(change) else: self._changes.append(change) def _collect_changes( self, tree1_id: ObjectID | None, tree2_id: ObjectID | None ) -> None: want_unchanged = self._find_copies_harder or self._want_unchanged for change in tree_changes( self._store, tree1_id, tree2_id, want_unchanged=want_unchanged, include_trees=self._include_trees, ): self._add_change(change) def _prune( self, add_paths: AbstractSet[bytes], delete_paths: AbstractSet[bytes] ) -> None: def check_add(a: TreeChange) -> bool: assert a.new is not None return a.new.path not in add_paths def check_delete(d: TreeChange) -> bool: assert d.old is not None return d.old.path not in delete_paths self._adds = [a for a in self._adds if check_add(a)] self._deletes = [d for d in self._deletes if check_delete(d)] def _find_exact_renames(self) -> None: add_map = defaultdict(list) for add in self._adds: assert add.new is not None add_map[add.new.sha].append(add.new) delete_map = defaultdict(list) for delete in self._deletes: # Keep track of whether the delete was actually marked as a delete. # If not, it needs to be marked as a copy. is_delete = delete.type == CHANGE_DELETE assert delete.old is not None delete_map[delete.old.sha].append((delete.old, is_delete)) add_paths = set() delete_paths = set() for sha, sha_deletes in delete_map.items(): sha_adds = add_map[sha] for (old, is_delete), new in zip(sha_deletes, sha_adds): assert old.mode is not None assert new.mode is not None if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode): continue if is_delete: assert old.path is not None delete_paths.add(old.path) assert new.path is not None add_paths.add(new.path) new_type = (is_delete and CHANGE_RENAME) or CHANGE_COPY self._changes.append(TreeChange(new_type, old, new)) num_extra_adds = len(sha_adds) - len(sha_deletes) # TODO(dborowitz): Less arbitrary way of dealing with extra copies. old = sha_deletes[0][0] if num_extra_adds > 0: for new in sha_adds[-num_extra_adds:]: assert new.path is not None add_paths.add(new.path) self._changes.append(TreeChange(CHANGE_COPY, old, new)) self._prune(add_paths, delete_paths) def _should_find_content_renames(self) -> bool: if self._max_files is None: return True return len(self._adds) * len(self._deletes) <= self._max_files**2 def _rename_type( self, check_paths: bool, delete: TreeChange, add: TreeChange ) -> str: assert delete.old is not None and add.new is not None if check_paths and delete.old.path == add.new.path: # If the paths match, this must be a split modify, so make sure it # comes out as a modify. return CHANGE_MODIFY elif delete.type != CHANGE_DELETE: # If it's in deletes but not marked as a delete, it must have been # added due to find_copies_harder, and needs to be marked as a # copy. return CHANGE_COPY return CHANGE_RENAME def _find_content_rename_candidates(self) -> None: candidates = self._candidates = [] # TODO: Optimizations: # - Compare object sizes before counting blocks. # - Skip if delete's S_IFMT differs from all adds. # - Skip if adds or deletes is empty. # Match C git's behavior of not attempting to find content renames if # the matrix size exceeds the threshold. if not self._should_find_content_renames(): return block_cache = {} check_paths = self._rename_threshold is not None for delete in self._deletes: assert delete.old is not None assert delete.old.mode is not None if S_ISGITLINK(delete.old.mode): continue # Git links don't exist in this repo. 
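            # The delete side's block counts are computed once below and handed
            # to _similarity_score via block_cache, which also caches the add
            # side, so add objects are not re-hashed for every delete.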
assert delete.old.sha is not None old_sha = delete.old.sha old_obj = self._store[old_sha] block_cache[old_sha] = _count_blocks(old_obj) for add in self._adds: assert add.new is not None assert add.new.mode is not None if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode): continue assert add.new.sha is not None new_obj = self._store[add.new.sha] score = _similarity_score(old_obj, new_obj, block_cache=block_cache) if score > self._rename_threshold: new_type = self._rename_type(check_paths, delete, add) rename = TreeChange(new_type, delete.old, add.new) candidates.append((-score, rename)) def _choose_content_renames(self) -> None: # Sort scores from highest to lowest, but keep names in ascending # order. self._candidates.sort() delete_paths = set() add_paths = set() for _, change in self._candidates: assert change.old is not None and change.new is not None new_path = change.new.path assert new_path is not None if new_path in add_paths: continue old_path = change.old.path assert old_path is not None orig_type = change.type if old_path in delete_paths: change = TreeChange(CHANGE_COPY, change.old, change.new) # If the candidate was originally a copy, that means it came from a # modified or unchanged path, so we don't want to prune it. if orig_type != CHANGE_COPY: delete_paths.add(old_path) add_paths.add(new_path) self._changes.append(change) self._prune(add_paths, delete_paths) def _join_modifies(self) -> None: if self._rewrite_threshold is None: return modifies = {} delete_map = {} for d in self._deletes: assert d.old is not None delete_map[d.old.path] = d for add in self._adds: assert add.new is not None path = add.new.path delete = delete_map.get(path) if ( delete is not None and delete.old is not None and delete.old.mode is not None and add.new.mode is not None and stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode) ): modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new) def check_add_mod(a: TreeChange) -> bool: assert a.new is not None return a.new.path not in modifies def check_delete_mod(d: TreeChange) -> bool: assert d.old is not None return d.old.path not in modifies self._adds = [a for a in self._adds if check_add_mod(a)] self._deletes = [d for d in self._deletes if check_delete_mod(d)] self._changes += modifies.values() def _sorted_changes(self) -> list[TreeChange]: result = [] result.extend(self._adds) result.extend(self._deletes) result.extend(self._changes) result.sort(key=_tree_change_key) return result def _prune_unchanged(self) -> None: if self._want_unchanged: return self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED] def changes_with_renames( self, tree1_id: ObjectID | None, tree2_id: ObjectID | None, want_unchanged: bool = False, include_trees: bool = False, ) -> list[TreeChange]: """Iterate TreeChanges between two tree SHAs, with rename detection.""" self._reset() self._want_unchanged = want_unchanged self._include_trees = include_trees self._collect_changes(tree1_id, tree2_id) self._find_exact_renames() self._find_content_rename_candidates() self._choose_content_renames() self._join_modifies() self._prune_unchanged() return self._sorted_changes() # Hold on to the pure-python implementations for testing. 
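# A minimal usage sketch (illustrative only; `store`, `tree1_id` and
# `tree2_id` are assumed to be an object store and two tree SHAs):
#
#     detector = RenameDetector(store, rename_threshold=60)
#     for change in detector.changes_with_renames(tree1_id, tree2_id):
#         print(change.type, change.old, change.new)
#
# A detector can also be passed to tree_changes() via its rename_detector
# argument.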
_is_tree_py = _is_tree _merge_entries_py = _merge_entries _count_blocks_py = _count_blocks if TYPE_CHECKING: # For type checking, use the Python implementations pass else: # At runtime, try to import Rust extensions try: # Try to import Rust versions from dulwich._diff_tree import ( _count_blocks as _rust_count_blocks, ) from dulwich._diff_tree import ( _is_tree as _rust_is_tree, ) from dulwich._diff_tree import ( _merge_entries as _rust_merge_entries, ) # Override with Rust versions _count_blocks = _rust_count_blocks _is_tree = _rust_is_tree _merge_entries = _rust_merge_entries except ImportError: pass dulwich-1.0.0/dulwich/diffstat.py000077500000000000000000000336261513301442600170000ustar00rootroot00000000000000#!/usr/bin/env python # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # SPDX-License-Identifier: MIT # Copyright (c) 2020 Kevin B. Hendricks, Stratford Ontario Canada # All rights reserved. # # This diffstat code was extracted and heavily modified from: # # https://github.com/techtonik/python-patch # Under the following license: # # Patch utility to apply unified diffs # Brute-force line-by-line non-recursive parsing # # Copyright (c) 2008-2016 anatoly techtonik # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. """Generate diff statistics similar to git's --stat option. This module provides functionality to parse unified diff output and generate statistics about changes, including: - Number of lines added and removed per file - Binary file detection - File rename detection - Formatted output similar to git diff --stat """ __all__ = [ "diffstat", "main", "parse_patch", ] import re import sys from collections.abc import Sequence # only needs to detect git style diffs as this is for # use with dulwich _git_header_name = re.compile(rb"diff --git a/(.*) b/(.*)") _GIT_HEADER_START = b"diff --git a/" _GIT_BINARY_START = b"Binary file" _GIT_RENAMEFROM_START = b"rename from" _GIT_RENAMETO_START = b"rename to" _GIT_CHUNK_START = b"@@" _GIT_ADDED_START = b"+" _GIT_DELETED_START = b"-" _GIT_UNCHANGED_START = b" " # emulate original full Patch class by just extracting # filename and minimal chunk added/deleted information to # properly interface with diffstat routine def parse_patch( lines: Sequence[bytes], ) -> tuple[list[bytes], list[bool], list[tuple[int, int]]]: """Parse a git style diff or patch to generate diff stats. 
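    For a hypothetical patch touching one text file and one binary file, the
    result has the shape ([b"a.py", b"logo.png"], [False, True],
    [(3, 1), (0, 0)]): filenames, binary flags, and (added, deleted) counts.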
Args: lines: list of byte string lines from the diff to be parsed Returns: A tuple (names, is_binary, counts) of three lists """ names = [] nametypes = [] counts = [] in_patch_chunk = in_git_header = binaryfile = False currentfile: bytes | None = None added = deleted = 0 for line in lines: if line.startswith(_GIT_HEADER_START): if currentfile is not None: names.append(currentfile) nametypes.append(binaryfile) counts.append((added, deleted)) m = _git_header_name.search(line) assert m currentfile = m.group(2) binaryfile = False added = deleted = 0 in_git_header = True in_patch_chunk = False elif line.startswith(_GIT_BINARY_START) and in_git_header: binaryfile = True in_git_header = False elif line.startswith(_GIT_RENAMEFROM_START) and in_git_header: currentfile = line[12:] elif line.startswith(_GIT_RENAMETO_START) and in_git_header: assert currentfile currentfile += b" => %s" % line[10:] elif line.startswith(_GIT_CHUNK_START) and (in_patch_chunk or in_git_header): in_patch_chunk = True in_git_header = False elif line.startswith(_GIT_ADDED_START) and in_patch_chunk: added += 1 elif line.startswith(_GIT_DELETED_START) and in_patch_chunk: deleted += 1 elif not line.startswith(_GIT_UNCHANGED_START) and in_patch_chunk: in_patch_chunk = False # handle end of input if currentfile is not None: names.append(currentfile) nametypes.append(binaryfile) counts.append((added, deleted)) return names, nametypes, counts # note must all done using bytes not string because on linux filenames # may not be encodable even to utf-8 def diffstat(lines: Sequence[bytes], max_width: int = 80) -> bytes: """Generate summary statistics from a git style diff ala (git diff tag1 tag2 --stat). Args: lines: list of byte string "lines" from the diff to be parsed max_width: maximum line length for generating the summary statistics (default 80) Returns: A byte string that lists the changed files with change counts and histogram. 
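    Illustrative output for a small two-file patch (exact column padding
    depends on the longest filename and the largest change count):

     docs/readme.txt | 4 ++--
     docs/logo.png   | Bin
     2 files changed, 2 insertions(+), 2 deletions(-)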
""" names, nametypes, counts = parse_patch(lines) insert = [] delete = [] namelen = 0 maxdiff = 0 # max changes for any file used for histogram width calc for i, filename in enumerate(names): i, d = counts[i] insert.append(i) delete.append(d) namelen = max(namelen, len(filename)) maxdiff = max(maxdiff, i + d) output = b"" statlen = len(str(maxdiff)) # stats column width for i, n in enumerate(names): binaryfile = nametypes[i] # %-19s | %-4d %s # note b'%d' % namelen is not supported until Python 3.5 # To convert an int to a format width specifier for byte # strings use str(namelen).encode('ascii') format = ( b" %-" + str(namelen).encode("ascii") + b"s | %" + str(statlen).encode("ascii") + b"s %s\n" ) binformat = b" %-" + str(namelen).encode("ascii") + b"s | %s\n" if not binaryfile: hist = b"" # -- calculating histogram -- width = len(format % (b"", b"", b"")) histwidth = max(2, max_width - width) if maxdiff < histwidth: hist = b"+" * insert[i] + b"-" * delete[i] else: iratio = (float(insert[i]) / maxdiff) * histwidth dratio = (float(delete[i]) / maxdiff) * histwidth iwidth = dwidth = 0 # make sure every entry that had actual insertions gets # at least one + if insert[i] > 0: iwidth = int(iratio) if iwidth == 0 and 0 < iratio < 1: iwidth = 1 # make sure every entry that had actual deletions gets # at least one - if delete[i] > 0: dwidth = int(dratio) if dwidth == 0 and 0 < dratio < 1: dwidth = 1 hist = b"+" * int(iwidth) + b"-" * int(dwidth) output += format % ( bytes(names[i]), str(insert[i] + delete[i]).encode("ascii"), hist, ) else: output += binformat % (bytes(names[i]), b"Bin") output += b" %d files changed, %d insertions(+), %d deletions(-)" % ( len(names), sum(insert), sum(delete), ) return output def main() -> int: """Main entry point for diffstat command line tool. Returns: Exit code (0 for success) """ argv = sys.argv # allow diffstat.py to also be used from the command line if len(sys.argv) > 1: diffpath = argv[1] data = b"" with open(diffpath, "rb") as f: data = f.read() lines = data.split(b"\n") result = diffstat(lines) print(result.decode("utf-8")) return 0 # if no path argument to a diff file is passed in, run # a self test. The test case includes tricky things like # a diff of diff, binary files, renames with further changes # added files and removed files. # All extracted from Sigil-Ebook/Sigil's github repo with # full permission to use under this license. selftest = b""" diff --git a/docs/qt512.7_remove_bad_workaround.patch b/docs/qt512.7_remove_bad_workaround.patch new file mode 100644 index 00000000..64e34192 --- /dev/null +++ b/docs/qt512.7_remove_bad_workaround.patch @@ -0,0 +1,15 @@ +--- qtbase/src/gui/kernel/qwindow.cpp.orig 2019-12-12 09:15:59.000000000 -0500 ++++ qtbase/src/gui/kernel/qwindow.cpp 2020-01-10 10:36:53.000000000 -0500 +@@ -218,12 +218,6 @@ + QGuiApplicationPrivate::window_list.removeAll(this); + if (!QGuiApplicationPrivate::is_app_closing) + QGuiApplicationPrivate::instance()->modalWindowList.removeOne(this); +- +- // focus_window is normally cleared in destroy(), but the window may in +- // some cases end up becoming the focus window again. Clear it again +- // here as a workaround. See QTBUG-75326. 
+- if (QGuiApplicationPrivate::focus_window == this) +- QGuiApplicationPrivate::focus_window = 0; + } + + void QWindowPrivate::init(QScreen *targetScreen) diff --git a/docs/testplugin_v017.zip b/docs/testplugin_v017.zip new file mode 100644 index 00000000..a4cf4c4c Binary files /dev/null and b/docs/testplugin_v017.zip differ diff --git a/ci_scripts/macgddeploy.py b/ci_scripts/gddeploy.py similarity index 73% rename from ci_scripts/macgddeploy.py rename to ci_scripts/gddeploy.py index a512d075..f9dacd33 100644 --- a/ci_scripts/macgddeploy.py +++ b/ci_scripts/gddeploy.py @@ -1,19 +1,32 @@ #!/usr/bin/env python3 import os +import sys import subprocess import datetime import shutil +import glob gparent = os.path.expandvars('$GDRIVE_DIR') grefresh_token = os.path.expandvars('$GDRIVE_REFRESH_TOKEN') -travis_branch = os.path.expandvars('$TRAVIS_BRANCH') -travis_commit = os.path.expandvars('$TRAVIS_COMMIT') -travis_build_number = os.path.expandvars('$TRAVIS_BUILD_NUMBER') +if sys.platform.lower().startswith('darwin'): + travis_branch = os.path.expandvars('$TRAVIS_BRANCH') + travis_commit = os.path.expandvars('$TRAVIS_COMMIT') + travis_build_number = os.path.expandvars('$TRAVIS_BUILD_NUMBER') + + origfilename = './bin/Sigil.tar.xz' + newfilename = './bin/Sigil-{}-{}-build_num-{}.tar.xz'.format(travis_branch, travis_commit[:7],travis_build_numbe\ r) +else: + appveyor_branch = os.path.expandvars('$APPVEYOR_REPO_BRANCH') + appveyor_commit = os.path.expandvars('$APPVEYOR_REPO_COMMIT') + appveyor_build_number = os.path.expandvars('$APPVEYOR_BUILD_NUMBER') + names = glob.glob('.\\installer\\Sigil-*-Setup.exe') + if not names: + exit(1) + origfilename = names[0] + newfilename = '.\\installer\\Sigil-{}-{}-build_num-{}-Setup.exe'.format(appveyor_branch, appveyor_commit[:7], ap\ pveyor_build_number) -origfilename = './bin/Sigil.tar.xz' -newfilename = './bin/Sigil-{}-{}-build_num-{}.tar.xz'.format(travis_branch, travis_commit[:7],travis_build_number) shutil.copy2(origfilename, newfilename) folder_name = datetime.date.today() diff --git a/docs/qt512.6_backport_009abcd_fix.patch b/docs/qt512.6_backport_009abcd_fix.patch deleted file mode 100644 index f4724347..00000000 --- a/docs/qt512.6_backport_009abcd_fix.patch +++ /dev/null @@ -1,26 +0,0 @@ ---- qtbase/src/widgets/kernel/qwidget.cpp.orig 2019-11-08 10:57:07.000000000 -0500 -+++ qtbase/src/widgets/kernel/qwidget.cpp 2019-12-11 12:32:24.000000000 -0500 -@@ -8934,6 +8934,23 @@ - } - } - switch (event->type()) { -+ case QEvent::PlatformSurface: { -+ // Sync up QWidget's view of whether or not the widget has been created -+ switch (static_cast(event)->surfaceEventType()) { -+ case QPlatformSurfaceEvent::SurfaceCreated: -+ if (!testAttribute(Qt::WA_WState_Created)) -+ create(); -+ break; -+ case QPlatformSurfaceEvent::SurfaceAboutToBeDestroyed: -+ if (testAttribute(Qt::WA_WState_Created)) { -+ // Child windows have already been destroyed by QWindow, -+ // so we skip them here. -+ destroy(false, false); -+ } -+ break; -+ } -+ break; -+ } - case QEvent::MouseMove: - mouseMoveEvent((QMouseEvent*)event); - break; diff --git a/docs/Building_Sigil_On_MacOSX.txt b/docs/Building_Sigil_On_MacOSX.txt index 3b41fd80..64914c78 100644 --- a/docs/Building_Sigil_On_MacOSX.txt +++ b/docs/Building_Sigil_On_MacOSX.txt @@ -113,7 +113,7 @@ install_name_tool -add_rpath @loader_path/../../Frameworks ./bin/Sigil.app/Content # To test if the newly bundled python 3 version of Sigil is working properly ypou can do the following: -1. 
download testplugin_v014.zip from https://github.com/Sigil-Ebook/Sigil/tree/master/docs +1. download testplugin_v017.zip from https://github.com/Sigil-Ebook/Sigil/tree/master/docs 2. open Sigil.app to the normal nearly blank template epub it generates when opened 3. use Plugins->Manage Plugins menu and make sure the "Use Bundled Python" checkbox is checked 4. use the "Add Plugin" button to navigate to and add testplugin.zip and then hit "Okay" to exit the Manage Plugins Dialog """ testoutput = b""" docs/qt512.7_remove_bad_workaround.patch | 15 ++++++++++++ docs/testplugin_v017.zip | Bin ci_scripts/macgddeploy.py => ci_scripts/gddeploy.py | 0 docs/qt512.6_backport_009abcd_fix.patch | 26 --------------------- docs/Building_Sigil_On_MacOSX.txt | 2 +- 5 files changed, 16 insertions(+), 27 deletions(-)""" # return 0 on success otherwise return -1 result = diffstat(selftest.split(b"\n")) if result == testoutput: print("self test passed") return 0 print("self test failed") print("Received:") print(result.decode("utf-8")) print("Expected:") print(testoutput.decode("utf-8")) return -1 if __name__ == "__main__": sys.exit(main()) dulwich-1.0.0/dulwich/dumb.py000066400000000000000000000441631513301442600161160ustar00rootroot00000000000000# dumb.py -- Support for dumb HTTP(S) git repositories # Copyright (C) 2025 Dulwich contributors # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Support for dumb HTTP(S) git repositories.""" __all__ = [ "DumbHTTPObjectStore", "DumbRemoteHTTPRepo", ] import os import tempfile import zlib from collections.abc import Callable, Iterator, Mapping, Sequence from io import BytesIO from typing import TYPE_CHECKING, Any from urllib.parse import urljoin if TYPE_CHECKING: from .object_format import ObjectFormat from .errors import NotGitRepository, ObjectFormatException from .object_store import BaseObjectStore from .objects import ( ZERO_SHA, Blob, Commit, ObjectID, RawObjectID, ShaFile, Tag, Tree, hex_to_sha, sha_to_hex, ) from .pack import Pack, PackData, PackIndex, UnpackedObject, load_pack_index_file from .protocol import split_peeled_refs from .refs import Ref, read_info_refs class DumbHTTPObjectStore(BaseObjectStore): """Object store implementation that fetches objects over dumb HTTP.""" def __init__( self, base_url: str, http_request_func: Callable[ [str, dict[str, str]], tuple[Any, Callable[..., bytes]] ], object_format: "ObjectFormat | None" = None, ) -> None: """Initialize a DumbHTTPObjectStore. Args: base_url: Base URL of the remote repository (e.g. "https://example.com/repo.git/") http_request_func: Function to make HTTP requests, should accept (url, headers) and return (response, read_func). 
object_format: Object format to use (defaults to DEFAULT_OBJECT_FORMAT) """ super().__init__(object_format=object_format) self.base_url = base_url.rstrip("/") + "/" self._http_request = http_request_func self._packs: list[tuple[str, PackIndex | None]] | None = None self._cached_objects: dict[bytes, tuple[int, bytes]] = {} self._temp_pack_dir: str | None = None def _ensure_temp_pack_dir(self) -> None: """Ensure we have a temporary directory for storing pack files.""" if self._temp_pack_dir is None: self._temp_pack_dir = tempfile.mkdtemp(prefix="dulwich-dumb-") def _fetch_url(self, path: str) -> bytes: """Fetch content from a URL path relative to base_url. Args: path: Path relative to base URL Returns: Content as bytes Raises: IOError: If the URL cannot be fetched """ url = urljoin(self.base_url, path) resp, read = self._http_request(url, {}) try: if resp.status == 404: raise OSError(f"Not found: {url}") elif resp.status != 200: raise OSError(f"HTTP error {resp.status}: {url}") # Read all content chunks = [] while True: chunk = read(4096) if not chunk: break chunks.append(chunk) return b"".join(chunks) finally: resp.close() def _fetch_loose_object(self, sha: bytes) -> tuple[int, bytes]: """Fetch a loose object by SHA. Args: sha: SHA1 of the object (hex string as bytes) Returns: Tuple of (type_num, content) Raises: KeyError: If object not found """ hex_sha = sha.decode("ascii") path = f"objects/{hex_sha[:2]}/{hex_sha[2:]}" try: compressed = self._fetch_url(path) except OSError: raise KeyError(sha) # Decompress and parse the object decompressed = zlib.decompress(compressed) # Parse header header_end = decompressed.find(b"\x00") if header_end == -1: raise ObjectFormatException("Invalid object header") header = decompressed[:header_end] content = decompressed[header_end + 1 :] parts = header.split(b" ", 1) if len(parts) != 2: raise ObjectFormatException("Invalid object header") obj_type = parts[0] obj_size = int(parts[1]) if len(content) != obj_size: raise ObjectFormatException("Object size mismatch") # Convert type name to type number type_map = { b"blob": Blob.type_num, b"tree": Tree.type_num, b"commit": Commit.type_num, b"tag": Tag.type_num, } if obj_type not in type_map: raise ObjectFormatException(f"Unknown object type: {obj_type!r}") return type_map[obj_type], content def _load_packs(self) -> None: """Load the list of available packs from the remote.""" if self._packs is not None: return self._packs = [] try: packs_data = self._fetch_url("objects/info/packs") except OSError: # No packs file, repository might only have loose objects return for line in packs_data.strip().split(b"\n"): if line.startswith(b"P "): pack_name = line[2:].decode("utf-8") # Extract just the pack name without path if "/" in pack_name: pack_name = pack_name.split("/")[-1] if pack_name.endswith(".pack"): pack_name = pack_name[:-5] # Remove .pack extension self._packs.append((pack_name, None)) def _get_pack_index(self, pack_name: str) -> PackIndex: """Get or fetch a pack index. 
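        The corresponding objects/pack/<name>.idx file is fetched lazily over
        HTTP on first use and cached in self._packs, so later lookups do not
        download it again.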
Args: pack_name: Name of the pack (without .idx extension) Returns: PackIndex object """ # Find the pack in our list for i, (name, idx) in enumerate(self._packs or []): if name == pack_name: if idx is None: # Fetch and cache the index idx_data = self._fetch_url(f"objects/pack/{pack_name}.idx") idx = load_pack_index_file( "", BytesIO(idx_data), self.object_format ) if self._packs is not None: self._packs[i] = (name, idx) return idx raise KeyError(f"Pack not found: {pack_name}") def _fetch_from_pack(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]: """Try to fetch an object from pack files. Args: sha: SHA1 of the object (hex string as bytes) Returns: Tuple of (type_num, content) Raises: KeyError: If object not found in any pack """ self._load_packs() # Convert hex to binary for pack operations if len(sha) == 20: binsha = RawObjectID(sha) # Already binary else: binsha = hex_to_sha(ObjectID(sha)) # Convert hex to binary for pack_name, pack_idx in self._packs or []: if pack_idx is None: pack_idx = self._get_pack_index(pack_name) try: # Check if object is in this pack pack_idx.object_offset(binsha) except KeyError: continue # We found the object, now we need to fetch the pack data # For efficiency, we could fetch just the needed portion, but for # simplicity we'll fetch the whole pack and cache it self._ensure_temp_pack_dir() if self._temp_pack_dir is None: raise RuntimeError("Temp pack directory not initialized") pack_path = os.path.join(self._temp_pack_dir, f"{pack_name}.pack") if not os.path.exists(pack_path): # Download the pack file data = self._fetch_url(f"objects/pack/{pack_name}.pack") with open(pack_path, "wb") as f: f.write(data) # Open the pack and get the object pack_data = PackData(pack_path, object_format=self.object_format) pack = Pack.from_objects(pack_data, pack_idx) try: return pack.get_raw(binsha) finally: pack.close() raise KeyError(sha) def get_raw(self, sha: RawObjectID | ObjectID) -> tuple[int, bytes]: """Obtain the raw text for an object. Args: sha: SHA1 of the object Returns: Tuple with numeric type and object contents """ # Check cache first if sha in self._cached_objects: return self._cached_objects[sha] # Try packs first try: result = self._fetch_from_pack(sha) self._cached_objects[sha] = result return result except KeyError: pass # Try loose object result = self._fetch_loose_object(sha) self._cached_objects[sha] = result return result def contains_loose(self, sha: RawObjectID | ObjectID) -> bool: """Check if a particular object is present by SHA1 and is loose.""" try: self._fetch_loose_object(sha) return True except KeyError: return False def __contains__(self, sha: RawObjectID | ObjectID) -> bool: """Check if a particular object is present by SHA1.""" if sha in self._cached_objects: return True # Try packs try: self._fetch_from_pack(sha) return True except KeyError: pass # Try loose object try: self._fetch_loose_object(sha) return True except KeyError: return False def __iter__(self) -> Iterator[ObjectID]: """Iterate over all SHAs in the store. Note: This is inefficient for dumb HTTP as it requires downloading all pack indices. """ seen = set() # We can't efficiently list loose objects over dumb HTTP # So we only iterate pack objects self._load_packs() for pack_name, idx in self._packs or []: if idx is None: idx = self._get_pack_index(pack_name) for sha in idx: if sha not in seen: seen.add(sha) yield sha_to_hex(RawObjectID(sha)) @property def packs(self) -> list[Any]: """Iterable of pack objects. Note: Returns empty list as we don't have actual Pack objects. 
""" return [] def add_object(self, obj: ShaFile) -> None: """Add a single object to this object store.""" raise NotImplementedError("Cannot add objects to dumb HTTP repository") def add_objects( self, objects: Sequence[tuple[ShaFile, str | None]], progress: Callable[[str], None] | None = None, ) -> "Pack | None": """Add a set of objects to this object store.""" raise NotImplementedError("Cannot add objects to dumb HTTP repository") def close(self) -> None: """Close the object store and release resources. This method cleans up the temporary pack directory. Can be called multiple times safely. """ if self._temp_pack_dir is not None: if os.path.exists(self._temp_pack_dir): import shutil shutil.rmtree(self._temp_pack_dir, ignore_errors=True) self._temp_pack_dir = None def __del__(self) -> None: """Warn if the object store is being deleted without closing.""" if self._temp_pack_dir is not None: import warnings warnings.warn( f"DumbHTTPObjectStore {self!r} was destroyed without calling close(). " "Temporary pack directory may not be cleaned up properly.", ResourceWarning, stacklevel=2, ) self.close() class DumbRemoteHTTPRepo: """Repository implementation for dumb HTTP remotes.""" def __init__( self, base_url: str, http_request_func: Callable[ [str, dict[str, str]], tuple[Any, Callable[..., bytes]] ], ) -> None: """Initialize a DumbRemoteHTTPRepo. Args: base_url: Base URL of the remote repository http_request_func: Function to make HTTP requests. """ self.base_url = base_url.rstrip("/") + "/" self._http_request = http_request_func self._refs: dict[Ref, ObjectID] | None = None self._peeled: dict[Ref, ObjectID] | None = None self.object_store = DumbHTTPObjectStore(base_url, http_request_func) def _fetch_url(self, path: str) -> bytes: """Fetch content from a URL path relative to base_url.""" url = urljoin(self.base_url, path) resp, read = self._http_request(url, {}) try: if resp.status == 404: raise OSError(f"Not found: {url}") elif resp.status != 200: raise OSError(f"HTTP error {resp.status}: {url}") chunks = [] while True: chunk = read(4096) if not chunk: break chunks.append(chunk) return b"".join(chunks) finally: resp.close() def get_refs(self) -> dict[Ref, ObjectID]: """Get dictionary with all refs.""" if self._refs is None: # Fetch info/refs try: refs_data = self._fetch_url("info/refs") except OSError: raise NotGitRepository(f"Cannot read refs from {self.base_url}") refs_hex = read_info_refs(BytesIO(refs_data)) # Keep SHAs as hex refs_raw, peeled_raw = split_peeled_refs(refs_hex) # Convert to typed dicts self._refs = {Ref(k): ObjectID(v) for k, v in refs_raw.items()} self._peeled = peeled_raw return dict(self._refs) def get_head(self) -> Ref | None: """Get the current HEAD reference. 
Returns: HEAD reference name or commit ID """ try: head_resp_bytes = self._fetch_url("HEAD") except OSError as e: if "HTTP error 429" not in str(e): return None else: # rate-limit reached so raise exception raise else: head_split = head_resp_bytes.replace(b"\n", b"").split(b" ") head_target_bytes = head_split[1] if len(head_split) > 1 else head_split[0] # handle HEAD legacy format containing a commit id instead of a ref name for ref_name, ret_target in self.get_refs().items(): if ret_target == head_target_bytes: return ref_name return Ref(head_target_bytes) def get_peeled(self, ref: Ref) -> ObjectID: """Get the peeled value of a ref.""" # For dumb HTTP, we don't have peeled refs readily available # We would need to fetch and parse tag objects sha: ObjectID | None = self.get_refs().get(ref, None) return sha if sha is not None else ZERO_SHA def fetch_pack_data( self, determine_wants: Callable[[Mapping[Ref, ObjectID], int | None], list[ObjectID]], graph_walker: object, progress: Callable[[bytes], None] | None = None, *, get_tagged: bool | None = None, depth: int | None = None, ) -> Iterator[UnpackedObject]: """Fetch pack data from the remote. This is the main method for fetching objects from a dumb HTTP remote. Since dumb HTTP doesn't support negotiation, we need to download all objects reachable from the wanted refs. Args: determine_wants: Function that returns list of wanted SHAs graph_walker: GraphWalker instance (not used for dumb HTTP) progress: Optional progress callback get_tagged: Whether to get tagged objects depth: Depth for shallow clones (not supported for dumb HTTP) Returns: Iterator of UnpackedObject instances """ refs = self.get_refs() wants = determine_wants(refs, depth) if not wants: return # For dumb HTTP, we traverse the object graph starting from wants to_fetch = set(wants) seen = set() while to_fetch: sha = to_fetch.pop() if sha in seen: continue seen.add(sha) # Fetch the object try: type_num, content = self.object_store.get_raw(sha) except KeyError: # Object not found, skip it continue unpacked = UnpackedObject(type_num, sha=sha, decomp_chunks=[content]) yield unpacked # Parse the object to find references to other objects obj = ShaFile.from_raw_string(type_num, content) if isinstance(obj, Commit): # Commit to_fetch.add(obj.tree) for parent in obj.parents: to_fetch.add(parent) elif isinstance(obj, Tag): # Tag to_fetch.add(obj.object[1]) elif isinstance(obj, Tree): # Tree for _, _, item_sha in obj.items(): assert item_sha is not None to_fetch.add(item_sha) if progress: progress(f"Fetching objects: {len(seen)} done\n".encode()) dulwich-1.0.0/dulwich/errors.py000066400000000000000000000220701513301442600164740ustar00rootroot00000000000000# errors.py -- errors for dulwich # Copyright (C) 2007 James Westby # Copyright (C) 2009-2012 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Dulwich-related exception classes and utility functions.""" __all__ = [ "ApplyDeltaError", "ChecksumMismatch", "CommitError", "FileFormatException", "GitProtocolError", "HangupException", "HookError", "MissingCommitError", "NoIndexPresent", "NotBlobError", "NotCommitError", "NotGitRepository", "NotTagError", "NotTreeError", "ObjectFormatException", "ObjectMissing", "PackedRefsException", "RefFormatError", "SendPackError", "UnexpectedCommandError", "WorkingTreeModifiedError", "WrongObjectException", ] # Please do not add more errors here, but instead add them close to the code # that raises the error. import binascii from collections.abc import Sequence class ChecksumMismatch(Exception): """A checksum didn't match the expected contents.""" def __init__( self, expected: bytes | str, got: bytes | str, extra: str | None = None, ) -> None: """Initialize a ChecksumMismatch exception. Args: expected: The expected checksum value (bytes or hex string). got: The actual checksum value (bytes or hex string). extra: Optional additional error information. """ if isinstance(expected, bytes) and len(expected) in (20, 32): expected_str = binascii.hexlify(expected).decode("ascii") else: expected_str = ( expected if isinstance(expected, str) else expected.decode("ascii") ) if isinstance(got, bytes) and len(got) in (20, 32): got_str = binascii.hexlify(got).decode("ascii") else: got_str = got if isinstance(got, str) else got.decode("ascii") self.expected = expected_str self.got = got_str self.extra = extra message = f"Checksum mismatch: Expected {expected_str}, got {got_str}" if self.extra is not None: message += f"; {extra}" Exception.__init__(self, message) class WrongObjectException(Exception): """Baseclass for all the _ is not a _ exceptions on objects. Do not instantiate directly. Subclasses should define a type_name attribute that indicates what was expected if they were raised. """ type_name: str def __init__(self, sha: bytes, *args: object, **kwargs: object) -> None: """Initialize a WrongObjectException. Args: sha: The SHA of the object that was not of the expected type. *args: Additional positional arguments. **kwargs: Additional keyword arguments. """ Exception.__init__(self, f"{sha.decode('ascii')} is not a {self.type_name}") class NotCommitError(WrongObjectException): """Indicates that the sha requested does not point to a commit.""" type_name = "commit" class NotTreeError(WrongObjectException): """Indicates that the sha requested does not point to a tree.""" type_name = "tree" class NotTagError(WrongObjectException): """Indicates that the sha requested does not point to a tag.""" type_name = "tag" class NotBlobError(WrongObjectException): """Indicates that the sha requested does not point to a blob.""" type_name = "blob" class MissingCommitError(Exception): """Indicates that a commit was not found in the repository.""" def __init__(self, sha: bytes, *args: object, **kwargs: object) -> None: """Initialize a MissingCommitError. Args: sha: The SHA of the missing commit. *args: Additional positional arguments. **kwargs: Additional keyword arguments. 
""" self.sha = sha Exception.__init__(self, f"{sha.decode('ascii')} is not in the revision store") class ObjectMissing(Exception): """Indicates that a requested object is missing.""" def __init__(self, sha: bytes, *args: object, **kwargs: object) -> None: """Initialize an ObjectMissing exception. Args: sha: The SHA of the missing object. *args: Additional positional arguments. **kwargs: Additional keyword arguments. """ Exception.__init__(self, f"{sha.decode('ascii')} is not in the pack") class ApplyDeltaError(Exception): """Indicates that applying a delta failed.""" def __init__(self, *args: object, **kwargs: object) -> None: """Initialize an ApplyDeltaError. Args: *args: Error message and additional positional arguments. **kwargs: Additional keyword arguments. """ Exception.__init__(self, *args, **kwargs) class NotGitRepository(Exception): """Indicates that no Git repository was found.""" def __init__(self, *args: object, **kwargs: object) -> None: """Initialize a NotGitRepository exception. Args: *args: Error message and additional positional arguments. **kwargs: Additional keyword arguments. """ Exception.__init__(self, *args, **kwargs) class GitProtocolError(Exception): """Git protocol exception.""" def __init__(self, *args: object, **kwargs: object) -> None: """Initialize a GitProtocolError. Args: *args: Error message and additional positional arguments. **kwargs: Additional keyword arguments. """ Exception.__init__(self, *args, **kwargs) def __eq__(self, other: object) -> bool: """Check equality between GitProtocolError instances. Args: other: The object to compare with. Returns: True if both are GitProtocolError instances with same args, False otherwise. """ return isinstance(other, GitProtocolError) and self.args == other.args class SendPackError(GitProtocolError): """An error occurred during send_pack.""" class HangupException(GitProtocolError): """Hangup exception.""" def __init__(self, stderr_lines: Sequence[bytes] | None = None) -> None: """Initialize a HangupException. Args: stderr_lines: Optional list of stderr output lines from the remote server. """ if stderr_lines: super().__init__( "\n".join( line.decode("utf-8", "surrogateescape") for line in stderr_lines ) ) else: super().__init__("The remote server unexpectedly closed the connection.") self.stderr_lines = stderr_lines def __eq__(self, other: object) -> bool: """Check equality between HangupException instances. Args: other: The object to compare with. Returns: True if both are HangupException instances with same stderr_lines, False otherwise. """ return ( isinstance(other, HangupException) and self.stderr_lines == other.stderr_lines ) class UnexpectedCommandError(GitProtocolError): """Unexpected command received in a proto line.""" def __init__(self, command: str | None) -> None: """Initialize an UnexpectedCommandError. Args: command: The unexpected command received, or None for flush-pkt. 
""" command_str = "flush-pkt" if command is None else f"command {command}" super().__init__(f"Protocol got unexpected {command_str}") class FileFormatException(Exception): """Base class for exceptions relating to reading git file formats.""" class PackedRefsException(FileFormatException): """Indicates an error parsing a packed-refs file.""" class ObjectFormatException(FileFormatException): """Indicates an error parsing an object.""" class NoIndexPresent(Exception): """No index is present.""" class CommitError(Exception): """An error occurred while performing a commit.""" class RefFormatError(Exception): """Indicates an invalid ref name.""" class HookError(Exception): """An error occurred while executing a hook.""" class WorkingTreeModifiedError(Exception): """Indicates that the working tree has modifications that would be overwritten.""" dulwich-1.0.0/dulwich/fastexport.py000066400000000000000000000337461513301442600173730ustar00rootroot00000000000000# __init__.py -- Fast export/import functionality # Copyright (C) 2010-2013 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Fast export/import functionality.""" __all__ = [ "GitFastExporter", "GitImportProcessor", "split_email", ] import stat from collections.abc import Generator from typing import TYPE_CHECKING, Any, BinaryIO from fastimport import commands, parser, processor from fastimport import errors as fastimport_errors from .index import commit_tree from .object_store import iter_tree_contents from .objects import ZERO_SHA, Blob, Commit, ObjectID, Tag from .refs import Ref if TYPE_CHECKING: from .object_store import BaseObjectStore from .repo import BaseRepo def split_email(text: bytes) -> tuple[bytes, bytes]: """Split email address from name. Args: text: Full name and email (e.g. b"John Doe ") Returns: Tuple of (name, email) """ # TODO(jelmer): Dedupe this and the same functionality in # format_annotate_line. (name, email) = text.rsplit(b" <", 1) return (name, email.rstrip(b">")) class GitFastExporter: """Generate a fast-export output stream for Git objects.""" def __init__(self, outf: BinaryIO, store: "BaseObjectStore") -> None: """Initialize the fast exporter. Args: outf: Output file to write to store: Object store to export from """ self.outf = outf self.store = store self.markers: dict[bytes, ObjectID] = {} self._marker_idx = 0 def print_cmd(self, cmd: object) -> None: """Print a command to the output stream. Args: cmd: Command object to print """ if hasattr(cmd, "__bytes__"): output = cmd.__bytes__() else: output = cmd.__repr__().encode("utf-8") self.outf.write(output + b"\n") def _allocate_marker(self) -> bytes: """Allocate a new marker. 
Returns: New marker as bytes """ self._marker_idx += 1 return str(self._marker_idx).encode("ascii") def _export_blob(self, blob: Blob) -> tuple[Any, bytes]: """Export a blob object. Args: blob: Blob object to export Returns: Tuple of (BlobCommand, marker) """ marker = self._allocate_marker() self.markers[marker] = blob.id return (commands.BlobCommand(marker, blob.data), marker) # type: ignore[no-untyped-call,unused-ignore] def emit_blob(self, blob: Blob) -> bytes: """Emit a blob to the output stream. Args: blob: Blob object to emit Returns: Marker for the blob """ (cmd, marker) = self._export_blob(blob) self.print_cmd(cmd) return marker def _iter_files( self, base_tree: ObjectID | None, new_tree: ObjectID | None ) -> Generator[Any, None, None]: for ( (old_path, new_path), (old_mode, new_mode), (old_hexsha, new_hexsha), ) in self.store.tree_changes(base_tree, new_tree): if new_path is None: if old_path is not None: yield commands.FileDeleteCommand(old_path) # type: ignore[no-untyped-call,unused-ignore] continue marker = b"" if new_mode is not None and not stat.S_ISDIR(new_mode): if new_hexsha is not None: blob = self.store[new_hexsha] from .objects import Blob if isinstance(blob, Blob): marker = self.emit_blob(blob) if old_path != new_path and old_path is not None: yield commands.FileRenameCommand(old_path, new_path) # type: ignore[no-untyped-call,unused-ignore] if old_mode != new_mode or old_hexsha != new_hexsha: prefixed_marker = b":" + marker assert new_mode is not None yield commands.FileModifyCommand( # type: ignore[no-untyped-call,unused-ignore] new_path, new_mode, prefixed_marker, None ) def _export_commit( self, commit: Commit, ref: Ref, base_tree: ObjectID | None = None ) -> tuple[Any, bytes]: file_cmds = list(self._iter_files(base_tree, commit.tree)) marker = self._allocate_marker() if commit.parents: from_ = commit.parents[0] merges = commit.parents[1:] else: from_ = None merges = [] author, author_email = split_email(commit.author) committer, committer_email = split_email(commit.committer) cmd = commands.CommitCommand( # type: ignore[no-untyped-call,unused-ignore] ref, marker, (author, author_email, commit.author_time, commit.author_timezone), ( committer, committer_email, commit.commit_time, commit.commit_timezone, ), commit.message, from_, merges, file_cmds, ) return (cmd, marker) def emit_commit( self, commit: Commit, ref: Ref, base_tree: ObjectID | None = None ) -> bytes: """Emit a commit in fast-export format. Args: commit: Commit object to export ref: Reference name for the commit base_tree: Base tree for incremental export Returns: Marker for the commit """ cmd, marker = self._export_commit(commit, ref, base_tree) self.print_cmd(cmd) return marker class GitImportProcessor(processor.ImportProcessor): # type: ignore[misc,unused-ignore] """An import processor that imports into a Git repository using Dulwich.""" # FIXME: Batch creation of objects? def __init__( self, repo: "BaseRepo", params: Any | None = None, # noqa: ANN401 verbose: bool = False, outf: BinaryIO | None = None, ) -> None: """Initialize GitImportProcessor. 
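        A typical (illustrative) entry point is import_stream(), e.g.
        GitImportProcessor(repo).import_stream(fileobj), which returns the
        marker-to-object-ID mapping built up during the import.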
Args: repo: Repository to import into params: Import parameters verbose: Whether to enable verbose output outf: Output file for verbose messages """ processor.ImportProcessor.__init__(self, params, verbose) # type: ignore[no-untyped-call,unused-ignore] self.repo = repo self.last_commit = ZERO_SHA self.markers: dict[bytes, ObjectID] = {} self._contents: dict[bytes, tuple[int, bytes]] = {} def lookup_object(self, objectish: bytes) -> ObjectID: """Look up an object by reference or marker. Args: objectish: Object reference or marker Returns: Object ID """ if objectish.startswith(b":"): return self.markers[objectish[1:]] return ObjectID(objectish) def import_stream(self, stream: BinaryIO) -> dict[bytes, ObjectID]: """Import from a fast-import stream. Args: stream: Stream to import from Returns: Dictionary of markers to object IDs """ p = parser.ImportParser(stream) # type: ignore[no-untyped-call,unused-ignore] self.process(p.iter_commands) # type: ignore[no-untyped-call,unused-ignore] return self.markers def blob_handler(self, cmd: commands.BlobCommand) -> None: """Process a BlobCommand.""" blob = Blob.from_string(cmd.data) self.repo.object_store.add_object(blob) if cmd.mark: self.markers[cmd.mark] = blob.id def checkpoint_handler(self, cmd: commands.CheckpointCommand) -> None: """Process a CheckpointCommand.""" def commit_handler(self, cmd: commands.CommitCommand) -> None: """Process a CommitCommand.""" commit = Commit() if cmd.author is not None: (author_name, author_email, author_timestamp, author_timezone) = cmd.author else: (author_name, author_email, author_timestamp, author_timezone) = ( cmd.committer ) ( committer_name, committer_email, commit_timestamp, commit_timezone, ) = cmd.committer if isinstance(author_name, str): author_name = author_name.encode("utf-8") if isinstance(author_email, str): author_email = author_email.encode("utf-8") commit.author = author_name + b" <" + author_email + b">" commit.author_timezone = author_timezone commit.author_time = int(author_timestamp) if isinstance(committer_name, str): committer_name = committer_name.encode("utf-8") if isinstance(committer_email, str): committer_email = committer_email.encode("utf-8") commit.committer = committer_name + b" <" + committer_email + b">" commit.commit_timezone = commit_timezone commit.commit_time = int(commit_timestamp) commit.message = cmd.message commit.parents = [] if cmd.from_: cmd.from_ = self.lookup_object(cmd.from_) self._reset_base(cmd.from_) for filecmd in cmd.iter_files(): # type: ignore[no-untyped-call,unused-ignore] if filecmd.name == b"filemodify": assert isinstance(filecmd, commands.FileModifyCommand) if filecmd.data is not None: blob = Blob.from_string(filecmd.data) self.repo.object_store.add_object(blob) blob_id = blob.id else: assert filecmd.dataref is not None blob_id = self.lookup_object(filecmd.dataref) self._contents[filecmd.path] = (filecmd.mode, blob_id) elif filecmd.name == b"filedelete": assert isinstance(filecmd, commands.FileDeleteCommand) del self._contents[filecmd.path] elif filecmd.name == b"filecopy": assert isinstance(filecmd, commands.FileCopyCommand) self._contents[filecmd.dest_path] = self._contents[filecmd.src_path] elif filecmd.name == b"filerename": assert isinstance(filecmd, commands.FileRenameCommand) self._contents[filecmd.new_path] = self._contents[filecmd.old_path] del self._contents[filecmd.old_path] elif filecmd.name == b"filedeleteall": self._contents = {} else: raise Exception(f"Command {filecmd.name!r} not supported") from dulwich.objects import ObjectID commit.tree 
= commit_tree( self.repo.object_store, ( (path, ObjectID(hexsha), mode) for (path, (mode, hexsha)) in self._contents.items() ), ) if self.last_commit != ZERO_SHA: commit.parents.append(self.last_commit) for merge in cmd.merges: commit.parents.append(self.lookup_object(merge)) self.repo.object_store.add_object(commit) self.repo[cmd.ref] = commit.id self.last_commit = commit.id if cmd.mark: mark_bytes = ( cmd.mark if isinstance(cmd.mark, bytes) else str(cmd.mark).encode("ascii") ) self.markers[mark_bytes] = commit.id def progress_handler(self, cmd: commands.ProgressCommand) -> None: """Process a ProgressCommand.""" def _reset_base(self, commit_id: ObjectID) -> None: if self.last_commit == commit_id: return self._contents = {} self.last_commit = commit_id if commit_id != ZERO_SHA: from .objects import Commit commit = self.repo[commit_id] tree_id = commit.tree if isinstance(commit, Commit) else None if tree_id is None: return for ( path, mode, hexsha, ) in iter_tree_contents(self.repo.object_store, tree_id): assert path is not None and mode is not None and hexsha is not None self._contents[path] = (mode, hexsha) def reset_handler(self, cmd: commands.ResetCommand) -> None: """Process a ResetCommand.""" from_: ObjectID if cmd.from_ is None: from_ = ZERO_SHA else: from_ = self.lookup_object(cmd.from_) self._reset_base(from_) self.repo.refs[Ref(cmd.ref)] = from_ def tag_handler(self, cmd: commands.TagCommand) -> None: """Process a TagCommand.""" tag = Tag() tag.tagger = cmd.tagger tag.message = cmd.message tag.name = cmd.from_ self.repo.object_store.add_object(tag) self.repo.refs["refs/tags/" + tag.name] = tag.id def feature_handler(self, cmd: commands.FeatureCommand) -> None: """Process a FeatureCommand.""" feature_name = ( cmd.feature_name.decode("utf-8") if isinstance(cmd.feature_name, bytes) else cmd.feature_name ) raise fastimport_errors.UnknownFeature(feature_name) # type: ignore[no-untyped-call,unused-ignore] dulwich-1.0.0/dulwich/file.py000066400000000000000000000256201513301442600161030ustar00rootroot00000000000000# file.py -- Safe access to git files # Copyright (C) 2010 Google, Inc. # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# """Safe access to git files.""" __all__ = [ "FileLocked", "GitFile", "ensure_dir_exists", ] import os import sys import warnings from collections.abc import Iterable, Iterator from types import TracebackType from typing import IO, Any, ClassVar, Literal, overload from ._typing import Buffer def ensure_dir_exists( dirname: str | bytes | os.PathLike[str] | os.PathLike[bytes], ) -> None: """Ensure a directory exists, creating if necessary.""" try: os.makedirs(dirname) except FileExistsError: pass def _fancy_rename(oldname: str | bytes, newname: str | bytes) -> None: """Rename file with temporary backup file to rollback if rename fails.""" if not os.path.exists(newname): os.rename(oldname, newname) return # Defer the tempfile import since it pulls in a lot of other things. import tempfile # destination file exists (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=str(oldname), dir=".") os.close(fd) os.remove(tmpfile) os.rename(newname, tmpfile) try: os.rename(oldname, newname) except OSError: os.rename(tmpfile, newname) raise os.remove(tmpfile) @overload def GitFile( filename: str | bytes | os.PathLike[str] | os.PathLike[bytes], mode: Literal["wb"], bufsize: int = -1, mask: int = 0o644, fsync: bool = True, ) -> "_GitFile": ... @overload def GitFile( filename: str | bytes | os.PathLike[str] | os.PathLike[bytes], mode: Literal["rb"] = "rb", bufsize: int = -1, mask: int = 0o644, fsync: bool = True, ) -> IO[bytes]: ... @overload def GitFile( filename: str | bytes | os.PathLike[str] | os.PathLike[bytes], mode: str = "rb", bufsize: int = -1, mask: int = 0o644, fsync: bool = True, ) -> "IO[bytes] | _GitFile": ... def GitFile( filename: str | bytes | os.PathLike[str] | os.PathLike[bytes], mode: str = "rb", bufsize: int = -1, mask: int = 0o644, fsync: bool = True, ) -> "IO[bytes] | _GitFile": """Create a file object that obeys the git file locking protocol. Returns: a builtin file object or a _GitFile object Note: See _GitFile for a description of the file locking protocol. Only read-only and write-only (binary) modes are supported; r+, w+, and a are not. To read and write from the same file, you can take advantage of the fact that opening a file for write does not actually open the file you request. The default file mask makes any created files user-writable and world-readable. Args: filename: Path to the file mode: File mode (only 'rb' and 'wb' are supported) bufsize: Buffer size for file operations mask: File mask for created files fsync: Whether to call fsync() before closing (default: True) """ if "a" in mode: raise OSError("append mode not supported for Git files") if "+" in mode: raise OSError("read/write mode not supported for Git files") if "b" not in mode: raise OSError("text mode not supported for Git files") if "w" in mode: return _GitFile(filename, mode, bufsize, mask, fsync) else: return open(filename, mode, bufsize) class FileLocked(Exception): """File is already locked.""" def __init__( self, filename: str | bytes, lockfilename: str | bytes, ) -> None: """Initialize FileLocked. Args: filename: Name of the file that is locked lockfilename: Name of the lock file """ self.filename = filename self.lockfilename = lockfilename super().__init__(filename, lockfilename) class _GitFile(IO[bytes]): """File that follows the git locking protocol for writes. All writes to a file foo will be written into foo.lock in the same directory, and the lockfile will be renamed to overwrite the original file on close. Note: You *must* call close() or abort() on a _GitFile for the lock to be released. 
Typically this will happen in a finally block. """ _file: IO[bytes] _filename: str | bytes _lockfilename: str | bytes _closed: bool PROXY_PROPERTIES: ClassVar[set[str]] = { "encoding", "errors", "mode", "name", "newlines", "softspace", } PROXY_METHODS: ClassVar[set[str]] = { "__iter__", "__next__", "flush", "fileno", "isatty", "read", "readable", "readline", "readlines", "seek", "seekable", "tell", "truncate", "writable", "write", "writelines", } def __init__( self, filename: str | bytes | os.PathLike[str] | os.PathLike[bytes], mode: str, bufsize: int, mask: int, fsync: bool = True, ) -> None: # Convert PathLike to str/bytes for our internal use self._filename: str | bytes = os.fspath(filename) self._fsync = fsync if isinstance(self._filename, bytes): self._lockfilename: str | bytes = self._filename + b".lock" else: self._lockfilename = self._filename + ".lock" try: fd = os.open( self._lockfilename, os.O_RDWR | os.O_CREAT | os.O_EXCL | getattr(os, "O_BINARY", 0), mask, ) except FileExistsError as exc: raise FileLocked(self._filename, self._lockfilename) from exc self._file = os.fdopen(fd, mode, bufsize) self._closed = False def __iter__(self) -> Iterator[bytes]: """Iterate over lines in the file.""" return iter(self._file) def abort(self) -> None: """Close and discard the lockfile without overwriting the target. If the file is already closed, this is a no-op. """ if self._closed: return self._file.close() try: os.remove(self._lockfilename) self._closed = True except FileNotFoundError: # The file may have been removed already, which is ok. self._closed = True def close(self) -> None: """Close this file, saving the lockfile over the original. Note: If this method fails, it will attempt to delete the lockfile. However, it is not guaranteed to do so (e.g. if a filesystem becomes suddenly read-only), which will prevent future writes to this file until the lockfile is removed manually. Raises: OSError: if the original file could not be overwritten. The lock file is still closed, so further attempts to write to the same file object will raise ValueError. 
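        Example of the overall locking protocol (a minimal sketch; the
        path "config" is only an illustration):

            f = GitFile("config", "wb")
            try:
                f.write(b"[core]\n\tbare = false\n")
            except BaseException:
                f.abort()   # remove config.lock, leave "config" untouched
                raise
            else:
                f.close()   # flush, fsync, rename config.lock over "config"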
""" if self._closed: return self._file.flush() if self._fsync: os.fsync(self._file.fileno()) self._file.close() try: if getattr(os, "replace", None) is not None: os.replace(self._lockfilename, self._filename) else: if sys.platform != "win32": os.rename(self._lockfilename, self._filename) else: # Windows versions prior to Vista don't support atomic # renames _fancy_rename(self._lockfilename, self._filename) finally: self.abort() def __del__(self) -> None: if not getattr(self, "_closed", True): warnings.warn(f"unclosed {self!r}", ResourceWarning, stacklevel=2) self.abort() def __enter__(self) -> "_GitFile": return self def __exit__( self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None, ) -> None: if exc_type is not None: self.abort() else: self.close() def __fspath__(self) -> str | bytes: """Return the file path for os.fspath() compatibility.""" return self._filename @property def closed(self) -> bool: """Return whether the file is closed.""" return self._closed def __getattr__(self, name: str) -> Any: # noqa: ANN401 """Proxy property calls to the underlying file.""" if name in self.PROXY_PROPERTIES: return getattr(self._file, name) raise AttributeError(name) # Implement IO[bytes] methods by delegating to the underlying file def read(self, size: int = -1) -> bytes: return self._file.read(size) # TODO: Remove type: ignore when Python 3.10 support is dropped (Oct 2026) # Python 3.10 has issues with IO[bytes] overload signatures def write(self, data: Buffer, /) -> int: # type: ignore[override,unused-ignore] return self._file.write(data) def readline(self, size: int = -1) -> bytes: return self._file.readline(size) def readlines(self, hint: int = -1) -> list[bytes]: return self._file.readlines(hint) # TODO: Remove type: ignore when Python 3.10 support is dropped (Oct 2026) # Python 3.10 has issues with IO[bytes] overload signatures def writelines(self, lines: Iterable[Buffer], /) -> None: # type: ignore[override,unused-ignore] return self._file.writelines(lines) def seek(self, offset: int, whence: int = 0) -> int: return self._file.seek(offset, whence) def tell(self) -> int: return self._file.tell() def flush(self) -> None: return self._file.flush() def truncate(self, size: int | None = None) -> int: return self._file.truncate(size) def fileno(self) -> int: return self._file.fileno() def isatty(self) -> bool: return self._file.isatty() def readable(self) -> bool: return self._file.readable() def writable(self) -> bool: return self._file.writable() def seekable(self) -> bool: return self._file.seekable() def __next__(self) -> bytes: return next(iter(self._file)) dulwich-1.0.0/dulwich/filter_branch.py000066400000000000000000000503141513301442600177640ustar00rootroot00000000000000# filter_branch.py - Git filter-branch functionality # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git filter-branch implementation.""" __all__ = [ "CommitData", "CommitFilter", "filter_refs", ] import os import tempfile import warnings from collections.abc import Callable, Sequence from typing import TypedDict from .index import Index, build_index_from_tree from .object_store import BaseObjectStore from .objects import Commit, ObjectID, Tag, Tree from .refs import Ref, RefsContainer, local_tag_name class CommitData(TypedDict, total=False): """TypedDict for commit data fields.""" author: bytes author_time: int author_timezone: int committer: bytes commit_time: int commit_timezone: int message: bytes encoding: bytes class CommitFilter: """Filter for rewriting commits during filter-branch operations.""" def __init__( self, object_store: BaseObjectStore, *, filter_fn: Callable[[Commit], CommitData | None] | None = None, filter_author: Callable[[bytes], bytes | None] | None = None, filter_committer: Callable[[bytes], bytes | None] | None = None, filter_message: Callable[[bytes], bytes | None] | None = None, tree_filter: Callable[[ObjectID, str], ObjectID | None] | None = None, index_filter: Callable[[ObjectID, str], ObjectID | None] | None = None, parent_filter: Callable[[Sequence[ObjectID]], list[ObjectID]] | None = None, commit_filter: Callable[[Commit, ObjectID], ObjectID | None] | None = None, subdirectory_filter: bytes | None = None, prune_empty: bool = False, tag_name_filter: Callable[[bytes], bytes | None] | None = None, ): """Initialize a commit filter. Args: object_store: Object store to read from and write to filter_fn: Optional callable that takes a Commit object and returns a dict of updated fields (author, committer, message, etc.) filter_author: Optional callable that takes author bytes and returns updated author bytes or None to keep unchanged filter_committer: Optional callable that takes committer bytes and returns updated committer bytes or None to keep unchanged filter_message: Optional callable that takes commit message bytes and returns updated message bytes tree_filter: Optional callable that takes (tree_sha, temp_dir) and returns new tree SHA after modifying working directory index_filter: Optional callable that takes (tree_sha, temp_index_path) and returns new tree SHA after modifying index parent_filter: Optional callable that takes parent list and returns modified parent list commit_filter: Optional callable that takes (Commit, tree_sha) and returns new commit SHA or None to skip commit subdirectory_filter: Optional subdirectory path to extract as new root prune_empty: Whether to prune commits that become empty tag_name_filter: Optional callable to rename tags """ self.object_store = object_store self.filter_fn = filter_fn self.filter_author = filter_author self.filter_committer = filter_committer self.filter_message = filter_message self.tree_filter = tree_filter self.index_filter = index_filter self.parent_filter = parent_filter self.commit_filter = commit_filter self.subdirectory_filter = subdirectory_filter self.prune_empty = prune_empty self.tag_name_filter = tag_name_filter self._old_to_new: dict[ObjectID, ObjectID] = {} self._processed: set[ObjectID] = set() self._tree_cache: dict[ObjectID, ObjectID] = {} # Cache for filtered trees def _filter_tree_with_subdirectory( self, tree_sha: ObjectID, subdirectory: bytes ) -> ObjectID | None: """Extract a subdirectory from a tree as the new root. 
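        For example, with subdirectory b"src/lib" the tree that "src/lib"
        points to in the original tree is returned and becomes the new root;
        if the path cannot be found, an empty tree is written to the object
        store and its SHA is returned instead.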
Args: tree_sha: SHA of the tree to filter subdirectory: Path to subdirectory to extract Returns: SHA of the new tree containing only the subdirectory, or None if not found """ try: tree = self.object_store[tree_sha] if not isinstance(tree, Tree): return None except KeyError: return None # Split subdirectory path parts = subdirectory.split(b"/") current_tree = tree # Navigate to subdirectory for part in parts: if not part: continue found = False for entry in current_tree.items(): if entry.path == part: try: assert entry.sha is not None obj = self.object_store[entry.sha] if isinstance(obj, Tree): current_tree = obj found = True break except KeyError: return None if not found: # Subdirectory not found, return empty tree empty_tree = Tree() self.object_store.add_object(empty_tree) return empty_tree.id # Return the subdirectory tree return current_tree.id def _apply_tree_filter(self, tree_sha: ObjectID) -> ObjectID: """Apply tree filter by checking out tree and running filter. Args: tree_sha: SHA of the tree to filter Returns: SHA of the filtered tree """ if tree_sha in self._tree_cache: return self._tree_cache[tree_sha] if not self.tree_filter: self._tree_cache[tree_sha] = tree_sha return tree_sha # Create temporary directory with tempfile.TemporaryDirectory() as tmpdir: # Check out tree to temp directory # We need a proper checkout implementation here # For now, pass tmpdir to filter and let it handle checkout new_tree_sha = self.tree_filter(tree_sha, tmpdir) if new_tree_sha is None: new_tree_sha = tree_sha self._tree_cache[tree_sha] = new_tree_sha return new_tree_sha def _apply_index_filter(self, tree_sha: ObjectID) -> ObjectID: """Apply index filter by creating temp index and running filter. Args: tree_sha: SHA of the tree to filter Returns: SHA of the filtered tree """ if tree_sha in self._tree_cache: return self._tree_cache[tree_sha] if not self.index_filter: self._tree_cache[tree_sha] = tree_sha return tree_sha # Create temporary index file with tempfile.NamedTemporaryFile(delete=False) as tmp_index: tmp_index_path = tmp_index.name try: # Build index from tree build_index_from_tree(".", tmp_index_path, self.object_store, tree_sha) # Run index filter new_tree_sha = self.index_filter(tree_sha, tmp_index_path) if new_tree_sha is None: # Read back the modified index and create new tree index = Index(tmp_index_path) new_tree_sha = index.commit(self.object_store) self._tree_cache[tree_sha] = new_tree_sha return new_tree_sha finally: os.unlink(tmp_index_path) def process_commit(self, commit_sha: ObjectID) -> ObjectID | None: """Process a single commit, creating a filtered version. 
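        Example of typical use through filter_refs (an illustrative sketch;
        "repo" is assumed to be an open repository, and the branch name and
        message rewrite are arbitrary):

            filt = CommitFilter(
                repo.object_store,
                filter_message=lambda msg: msg.replace(b"WIP: ", b""),
            )
            mapping = filter_refs(
                repo.refs, repo.object_store, [b"refs/heads/master"], filt
            )

        filter_refs (defined later in this module) calls process_commit for
        each ref head and returns the resulting old-to-new SHA mapping.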
Args: commit_sha: SHA of the commit to process Returns: SHA of the new commit, or None if object not found """ if commit_sha in self._processed: return self._old_to_new.get(commit_sha, commit_sha) self._processed.add(commit_sha) try: commit = self.object_store[commit_sha] except KeyError: # Object not found return None if not isinstance(commit, Commit): # Not a commit, return as-is self._old_to_new[commit_sha] = commit_sha return commit_sha # Process parents first new_parents = [] for parent in commit.parents: new_parent = self.process_commit(parent) if new_parent: # Skip None parents new_parents.append(new_parent) # Apply parent filter if self.parent_filter: new_parents = self.parent_filter(new_parents) # Apply tree filters new_tree = commit.tree # Subdirectory filter takes precedence if self.subdirectory_filter: filtered_tree = self._filter_tree_with_subdirectory( commit.tree, self.subdirectory_filter ) if filtered_tree: new_tree = filtered_tree # Then apply tree filter if self.tree_filter: new_tree = self._apply_tree_filter(new_tree) # Or apply index filter elif self.index_filter: new_tree = self._apply_index_filter(new_tree) # Check if we should prune empty commits if self.prune_empty and len(new_parents) == 1: # Check if tree is same as parent's tree parent_commit = self.object_store[new_parents[0]] if isinstance(parent_commit, Commit) and parent_commit.tree == new_tree: # This commit doesn't change anything, skip it self._old_to_new[commit_sha] = new_parents[0] return new_parents[0] # Apply filters new_data: CommitData = {} # Custom filter function takes precedence if self.filter_fn: filtered = self.filter_fn(commit) if filtered: new_data.update(filtered) # Apply specific filters if self.filter_author and "author" not in new_data: new_author = self.filter_author(commit.author) if new_author is not None: new_data["author"] = new_author if self.filter_committer and "committer" not in new_data: new_committer = self.filter_committer(commit.committer) if new_committer is not None: new_data["committer"] = new_committer if self.filter_message and "message" not in new_data: new_message = self.filter_message(commit.message) if new_message is not None: new_data["message"] = new_message # Create new commit if anything changed if new_data or new_parents != commit.parents or new_tree != commit.tree: new_commit = Commit() new_commit.tree = new_tree new_commit.parents = new_parents new_commit.author = new_data.get("author", commit.author) new_commit.author_time = new_data.get("author_time", commit.author_time) new_commit.author_timezone = new_data.get( "author_timezone", commit.author_timezone ) new_commit.committer = new_data.get("committer", commit.committer) new_commit.commit_time = new_data.get("commit_time", commit.commit_time) new_commit.commit_timezone = new_data.get( "commit_timezone", commit.commit_timezone ) new_commit.message = new_data.get("message", commit.message) new_commit.encoding = new_data.get("encoding", commit.encoding) # Copy extra fields if hasattr(commit, "_author_timezone_neg_utc"): new_commit._author_timezone_neg_utc = commit._author_timezone_neg_utc if hasattr(commit, "_commit_timezone_neg_utc"): new_commit._commit_timezone_neg_utc = commit._commit_timezone_neg_utc if hasattr(commit, "_extra"): new_commit._extra = list(commit._extra) if hasattr(commit, "_gpgsig"): new_commit._gpgsig = commit._gpgsig if hasattr(commit, "_mergetag"): new_commit._mergetag = list(commit._mergetag) # Apply commit filter if provided if self.commit_filter: # The commit filter can create a 
completely new commit new_commit_sha = self.commit_filter(new_commit, new_tree) if new_commit_sha is None: # Skip this commit if len(new_parents) == 1: self._old_to_new[commit_sha] = new_parents[0] return new_parents[0] elif len(new_parents) == 0: return None else: # Multiple parents, can't skip # Store the new commit anyway self.object_store.add_object(new_commit) self._old_to_new[commit_sha] = new_commit.id return new_commit.id else: self._old_to_new[commit_sha] = new_commit_sha return new_commit_sha else: # Store the new commit self.object_store.add_object(new_commit) self._old_to_new[commit_sha] = new_commit.id return new_commit.id else: # No changes, keep original self._old_to_new[commit_sha] = commit_sha return commit_sha def get_mapping(self) -> dict[ObjectID, ObjectID]: """Get the mapping of old commit SHAs to new commit SHAs. Returns: Dictionary mapping old SHAs to new SHAs """ return self._old_to_new.copy() def filter_refs( refs: RefsContainer, object_store: BaseObjectStore, ref_names: Sequence[bytes], commit_filter: CommitFilter, *, keep_original: bool = True, force: bool = False, tag_callback: Callable[[Ref, Ref], None] | None = None, ) -> dict[ObjectID, ObjectID]: """Filter commits reachable from the given refs. Args: refs: Repository refs container object_store: Object store containing commits ref_names: List of ref names to filter commit_filter: CommitFilter instance to use keep_original: Keep original refs under refs/original/ force: Force operation even if refs have been filtered before tag_callback: Optional callback for processing tags Returns: Dictionary mapping old commit SHAs to new commit SHAs Raises: ValueError: If refs have already been filtered and force is False """ # Check if already filtered if keep_original and not force: for ref in ref_names: original_ref = Ref(b"refs/original/" + ref) if original_ref in refs: raise ValueError( f"Branch {ref.decode()} appears to have been filtered already. " "Use force=True to force re-filtering." ) # Process commits starting from refs for ref in ref_names: try: # Get the commit SHA for this ref ref_obj = Ref(ref) if ref_obj in refs: ref_sha = refs[ref_obj] if ref_sha: commit_filter.process_commit(ref_sha) except KeyError: # Skip refs that can't be resolved warnings.warn(f"Could not process ref {ref!r}: ref not found") continue # Update refs mapping = commit_filter.get_mapping() for ref in ref_names: try: ref_obj = Ref(ref) if ref_obj in refs: old_sha = refs[ref_obj] new_sha = mapping.get(old_sha, old_sha) if old_sha != new_sha: # Save original ref if requested if keep_original: original_ref = Ref(b"refs/original/" + ref) refs[original_ref] = old_sha # Update ref to new commit refs[ref_obj] = new_sha except KeyError: # Not a valid ref, skip updating warnings.warn(f"Could not update ref {ref!r}: ref not found") continue # Handle tag filtering if commit_filter.tag_name_filter and tag_callback: # Process all tags for ref in refs.allkeys(): if ref.startswith(b"refs/tags/"): # Get the tag object or commit it points to tag_sha = refs[ref] tag_obj = object_store[tag_sha] tag_name = ref[10:] # Remove 'refs/tags/' # Check if it's an annotated tag if isinstance(tag_obj, Tag): # Get the commit it points to target_sha = tag_obj.object[1] # Process tag if: # 1. It points to a rewritten commit, OR # 2. 
We want to rename the tag regardless if ( target_sha in mapping or commit_filter.tag_name_filter is not None ): new_tag_name = commit_filter.tag_name_filter(tag_name) if new_tag_name and new_tag_name != tag_name: # For annotated tags pointing to rewritten commits, # we need to create a new tag object if target_sha in mapping: new_target = mapping[target_sha] # Create new tag object pointing to rewritten commit new_tag = Tag() new_tag.object = (tag_obj.object[0], new_target) new_tag.name = new_tag_name new_tag.message = tag_obj.message new_tag.tagger = tag_obj.tagger new_tag.tag_time = tag_obj.tag_time new_tag.tag_timezone = tag_obj.tag_timezone object_store.add_object(new_tag) # Update ref to point to new tag object refs[local_tag_name(new_tag_name)] = new_tag.id # Delete old tag del refs[ref] else: # Just rename the tag new_ref = local_tag_name(new_tag_name) tag_callback(ref, new_ref) elif isinstance(tag_obj, Commit): # Lightweight tag - points directly to a commit # Process if commit was rewritten or we want to rename if tag_sha in mapping or commit_filter.tag_name_filter is not None: new_tag_name = commit_filter.tag_name_filter(tag_name) if new_tag_name and new_tag_name != tag_name: new_ref = local_tag_name(new_tag_name) if tag_sha in mapping: # Point to rewritten commit refs[new_ref] = mapping[tag_sha] del refs[ref] else: # Just rename tag_callback(ref, new_ref) return mapping dulwich-1.0.0/dulwich/filters.py000066400000000000000000001112641513301442600166340ustar00rootroot00000000000000# filters.py -- Git filter drivers (clean/smudge) implementation # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Implementation of Git filter drivers (clean/smudge filters).""" __all__ = [ "CompositeFilterDriver", "FilterBlobNormalizer", "FilterContext", "FilterDriver", "FilterError", "FilterRegistry", "ProcessFilterDriver", "get_filter_for_path", ] import logging import shlex import subprocess import threading from collections.abc import Callable from typing import TYPE_CHECKING from typing import Protocol as TypingProtocol from .attrs import GitAttributes from .objects import Blob if TYPE_CHECKING: from .config import StackedConfig from .protocol import Protocol from .repo import BaseRepo class FilterError(Exception): """Exception raised when filter operations fail.""" class FilterDriver(TypingProtocol): """Protocol for filter drivers.""" def clean(self, data: bytes) -> bytes: """Apply clean filter (working tree → repository).""" ... def smudge(self, data: bytes, path: bytes = b"") -> bytes: """Apply smudge filter (repository → working tree).""" ... def cleanup(self) -> None: """Clean up any resources held by this filter driver.""" ... 
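    # An illustrative sketch of a FilterDriver implementation: a trivial
    # filter that strips trailing whitespace on checkin and leaves checkout
    # untouched. The class name and behaviour are assumptions, not part of
    # dulwich; it also satisfies the reuse() hook declared next.
    #
    # class StripTrailingWhitespaceFilter:
    #     def clean(self, data: bytes) -> bytes:
    #         return b"\n".join(line.rstrip() for line in data.split(b"\n"))
    #
    #     def smudge(self, data: bytes, path: bytes = b"") -> bytes:
    #         return data
    #
    #     def cleanup(self) -> None:
    #         pass  # no external resources to release
    #
    #     def reuse(self, config: "StackedConfig", filter_name: str) -> bool:
    #         return False  # cheap to build, so never cache it
    #
    # Such an instance can be registered with
    # FilterRegistry.register_driver("strip-ws", StripTrailingWhitespaceFilter())
    # (FilterRegistry is defined later in this module).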
def reuse(self, config: "StackedConfig", filter_name: str) -> bool: """Check if this filter driver should be reused with the given configuration. This method determines whether a cached filter driver instance should continue to be used or if it should be recreated. Only filters that are expensive to create (like long-running process filters) and whose configuration hasn't changed should return True. Lightweight filters should return False to ensure they always use the latest configuration. Args: config: The current configuration stack filter_name: The name of the filter in config Returns: True if the filter should be reused, False if it should be recreated """ ... class CompositeFilterDriver: """Filter driver that chains multiple filters together.""" def __init__(self, filters: list[FilterDriver]) -> None: """Initialize CompositeFilterDriver. Args: filters: List of filters to apply in order """ self.filters = filters def clean(self, data: bytes) -> bytes: """Apply all clean filters in order.""" for filter_driver in self.filters: data = filter_driver.clean(data) return data def smudge(self, data: bytes, path: bytes = b"") -> bytes: """Apply all smudge filters in reverse order.""" # For smudge, apply filters in reverse order for filter_driver in reversed(self.filters): data = filter_driver.smudge(data, path) return data def cleanup(self) -> None: """Clean up all filter drivers.""" for filter_driver in self.filters: filter_driver.cleanup() def reuse(self, config: "StackedConfig", filter_name: str) -> bool: """Check if all filters can be reused.""" # A composite filter can only be reused if all its components can return all(f.reuse(config, filter_name) for f in self.filters) class ProcessFilterDriver: """Filter driver that executes external processes.""" def __init__( self, clean_cmd: str | None = None, smudge_cmd: str | None = None, required: bool = False, cwd: str | None = None, process_cmd: str | None = None, ) -> None: """Initialize ProcessFilterDriver. 
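        Example (an illustrative sketch; the gzip commands are placeholders
        for whatever clean/smudge commands are configured):

            driver = ProcessFilterDriver(clean_cmd="gzip -nc", smudge_cmd="gzip -dc")
            stored = driver.clean(b"working tree contents")
            restored = driver.smudge(stored, b"path/in/tree")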
Args: clean_cmd: Command to run for clean filter smudge_cmd: Command to run for smudge filter required: Whether the filter is required cwd: Working directory for filter execution process_cmd: Command to run for process filter (preferred for performance) """ self.clean_cmd = clean_cmd self.smudge_cmd = smudge_cmd self.required = required self.cwd = cwd self.process_cmd = process_cmd self._process: subprocess.Popen[bytes] | None = None self._protocol: Protocol | None = None self._capabilities: set[bytes] = set() self._process_lock = threading.Lock() def _get_or_start_process(self) -> "Protocol | None": """Get or start the long-running process filter.""" if self._process is None and self.process_cmd: from .errors import GitProtocolError, HangupException from .protocol import Protocol try: self._process = subprocess.Popen( self.process_cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=self.cwd, text=False, # Use bytes ) # Check if process started successfully if self._process.poll() is not None: # Process already terminated raise OSError( f"Process terminated immediately with code {self._process.returncode}" ) # Create protocol wrapper def write_func(data: bytes) -> int: assert self._process is not None assert self._process.stdin is not None n = self._process.stdin.write(data) self._process.stdin.flush() return n def read_func(size: int) -> bytes: assert self._process is not None assert self._process.stdout is not None return self._process.stdout.read(size) self._protocol = Protocol(read_func, write_func) # Send handshake using pkt-line format self._protocol.write_pkt_line(b"git-filter-client") self._protocol.write_pkt_line(b"version=2") self._protocol.write_pkt_line(None) # flush packet # Read handshake response welcome = self._protocol.read_pkt_line() version = self._protocol.read_pkt_line() flush = self._protocol.read_pkt_line() # Verify handshake (be liberal - accept with or without newlines) if welcome and welcome.rstrip(b"\n\r") != b"git-filter-server": raise FilterError(f"Invalid welcome message: {welcome!r}") if version and version.rstrip(b"\n\r") != b"version=2": raise FilterError(f"Invalid version: {version!r}") if flush is not None: raise FilterError("Expected flush packet after handshake") # Send capabilities self._protocol.write_pkt_line(b"capability=clean") self._protocol.write_pkt_line(b"capability=smudge") self._protocol.write_pkt_line(None) # flush packet # Read capability response capabilities = [] while True: pkt = self._protocol.read_pkt_line() if pkt is None: # flush packet break capabilities.append(pkt) # Store supported capabilities self._capabilities = set() for cap in capabilities: cap = cap.rstrip(b"\n\r") # Be liberal - strip any line endings if cap.startswith(b"capability="): self._capabilities.add(cap[11:]) # Remove "capability=" prefix except ( OSError, subprocess.SubprocessError, HangupException, GitProtocolError, ) as e: self.cleanup() raise FilterError(f"Failed to start process filter: {e}") return self._protocol def _use_process_filter(self, data: bytes, operation: str, path: str = "") -> bytes: """Use the long-running process filter for the operation.""" with self._process_lock: try: proc = self._get_or_start_process() if proc is None: return data operation_bytes = operation.encode() if operation_bytes not in self._capabilities: raise FilterError(f"Operation {operation} not supported by filter") if not self._protocol: raise FilterError("Protocol not initialized") # Send request using pkt-line format 
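                # The request is a sequence of pkt-lines: key=value headers
                # ("command=clean|smudge", "pathname=<path>"), a flush-pkt,
                # then the blob content split into pkt-lines of at most
                # 65516 bytes of payload each, and a final flush-pkt. The
                # filter answers with status headers, the converted content,
                # and a trailing header block, each section terminated by a
                # flush-pkt (see the reading code below).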
self._protocol.write_pkt_line(f"command={operation}".encode()) self._protocol.write_pkt_line(f"pathname={path}".encode()) self._protocol.write_pkt_line(None) # flush packet # Send data # Split data into chunks if needed (max pkt-line payload is 65516 bytes) chunk_size = 65516 for i in range(0, len(data), chunk_size): chunk = data[i : i + chunk_size] self._protocol.write_pkt_line(chunk) self._protocol.write_pkt_line(None) # flush packet to end data # Read response (initial headers) response_headers = {} while True: pkt = self._protocol.read_pkt_line() if pkt is None: # flush packet ends headers break key, _, value = pkt.rstrip(b"\n\r").partition(b"=") response_headers[key] = value # Check status status = response_headers.get(b"status", b"error") if status != b"success": status_str = status.decode("utf-8", errors="replace") raise FilterError( f"Process filter {operation} failed: {status_str}" ) # Read result data result_chunks = [] while True: pkt = self._protocol.read_pkt_line() if pkt is None: # flush packet ends data break result_chunks.append(pkt) # Read final headers per Git filter protocol # Filters send: headers + flush + content + flush + final_headers + flush final_headers = {} while True: pkt = self._protocol.read_pkt_line() if pkt is None: # flush packet ends final headers break key, _, value = pkt.rstrip(b"\n\r").partition(b"=") final_headers[key] = value # Check final status (if provided, it overrides the initial status) final_status = final_headers.get(b"status", status) if final_status != b"success": final_status_str = final_status.decode("utf-8", errors="replace") raise FilterError( f"Process filter {operation} failed with final status: {final_status_str}" ) return b"".join(result_chunks) except (OSError, subprocess.SubprocessError, ValueError) as e: # Clean up broken process self.cleanup() raise FilterError(f"Process filter failed: {e}") def clean(self, data: bytes) -> bytes: """Apply clean filter using external process.""" import os # Try process filter first (much faster) if self.process_cmd: try: return self._use_process_filter(data, "clean") except FilterError as e: if self.required: raise logging.warning("Process filter failed, falling back: %s", e) # Fall back to clean command if not self.clean_cmd: if self.required: raise FilterError("Clean command is required but not configured") return data # Parse command into list of arguments # Use shlex.split for proper handling of quoted arguments # On Windows, shlex needs posix=False for correct parsing cmd_args = shlex.split(self.clean_cmd, posix=(os.name != "nt")) try: result = subprocess.run( cmd_args, shell=False, input=data, capture_output=True, check=True, cwd=self.cwd, ) return result.stdout except subprocess.CalledProcessError as e: if self.required: raise FilterError(f"Required clean filter failed: {e}") # If not required, log warning and return original data on failure logging.warning("Optional clean filter failed: %s", e) return data def smudge(self, data: bytes, path: bytes = b"") -> bytes: """Apply smudge filter using external process.""" import os path_str = path.decode("utf-8", errors="replace") # Try process filter first (much faster) if self.process_cmd: try: return self._use_process_filter(data, "smudge", path_str) except FilterError as e: if self.required: raise logging.warning("Process filter failed, falling back: %s", e) # Fall back to smudge command if not self.smudge_cmd: if self.required: raise FilterError("Smudge command is required but not configured") return data # Parse command into list of 
arguments and substitute %f placeholder # Use shlex.split for proper handling of quoted arguments # On Windows, shlex needs posix=False for correct parsing cmd_args = shlex.split(self.smudge_cmd, posix=(os.name != "nt")) # Replace %f placeholder with actual path cmd_args = [arg.replace("%f", path_str) for arg in cmd_args] try: result = subprocess.run( cmd_args, shell=False, input=data, capture_output=True, check=True, cwd=self.cwd, ) return result.stdout except subprocess.CalledProcessError as e: if self.required: raise FilterError( f"Required smudge filter failed: {e} {e.stderr} {e.stdout}" ) # If not required, log warning and return original data on failure logging.warning("Optional smudge filter failed: %s", e) return data def cleanup(self) -> None: """Clean up the process filter.""" if self._process: # Close stdin first to signal the process to quit cleanly if self._process.stdin and not self._process.stdin.closed: try: self._process.stdin.close() except BrokenPipeError: pass # Try to terminate gracefully first if self._process.poll() is None: # Still running try: self._process.terminate() self._process.wait(timeout=2) except subprocess.TimeoutExpired: # Force kill if terminate didn't work try: self._process.kill() self._process.wait(timeout=3) except subprocess.TimeoutExpired: # On Windows, sometimes we need to be more aggressive import os if os.name == "nt": try: subprocess.run( [ "taskkill", "/F", "/T", "/PID", str(self._process.pid), ], capture_output=True, timeout=5, ) self._process.wait(timeout=1) except ( subprocess.CalledProcessError, subprocess.TimeoutExpired, ): pass else: try: import signal os.kill(self._process.pid, signal.SIGKILL) # type: ignore[attr-defined,unused-ignore] self._process.wait(timeout=1) except (ProcessLookupError, subprocess.TimeoutExpired): pass except ProcessLookupError: # Process already dead pass # Close stdout and stderr to prevent resource leaks if self._process.stdout and not self._process.stdout.closed: try: self._process.stdout.close() except (OSError, ValueError): # OSError: I/O operation on closed file # ValueError: I/O operation on closed file (some platforms) pass if self._process.stderr and not self._process.stderr.closed: try: self._process.stderr.close() except (OSError, ValueError): pass self._process = None self._protocol = None def reuse(self, config: "StackedConfig", filter_name: str) -> bool: """Check if this filter driver should be reused with the given configuration.""" # Only reuse if it's a long-running process filter AND config hasn't changed if self.process_cmd is None: # Not a long-running filter, don't cache return False # Check if the filter commands in config match our current commands try: clean_cmd_raw = config.get(("filter", filter_name), "clean") except KeyError: clean_cmd = None else: clean_cmd = ( clean_cmd_raw.decode("utf-8") if isinstance(clean_cmd_raw, bytes) else clean_cmd_raw ) if clean_cmd != self.clean_cmd: return False try: smudge_cmd_raw = config.get(("filter", filter_name), "smudge") except KeyError: smudge_cmd = None else: smudge_cmd = ( smudge_cmd_raw.decode("utf-8") if isinstance(smudge_cmd_raw, bytes) else smudge_cmd_raw ) if smudge_cmd != self.smudge_cmd: return False try: process_cmd_raw = config.get(("filter", filter_name), "process") except KeyError: process_cmd = None else: process_cmd = ( process_cmd_raw.decode("utf-8") if isinstance(process_cmd_raw, bytes) else process_cmd_raw ) if process_cmd != self.process_cmd: return False required = config.get_boolean(("filter", filter_name), "required", False) 
if required != self.required: return False return True def __del__(self) -> None: """Clean up the process filter on destruction.""" self.cleanup() class FilterContext: """Context for managing stateful filter resources. This class manages the runtime state for filters, including: - Cached filter driver instances that maintain long-running state - Resource lifecycle management It works in conjunction with FilterRegistry to provide complete filter functionality while maintaining proper separation of concerns. """ def __init__(self, filter_registry: "FilterRegistry") -> None: """Initialize FilterContext. Args: filter_registry: The filter registry to use for driver lookups """ self.filter_registry = filter_registry self._active_drivers: dict[str, FilterDriver] = {} def get_driver(self, name: str) -> FilterDriver | None: """Get a filter driver by name, managing stateful instances. This method handles driver instantiation and caching. Only drivers that should be reused are cached. Args: name: The filter name Returns: FilterDriver instance or None """ driver: FilterDriver | None = None # Check if we have a cached instance that should be reused if name in self._active_drivers: driver = self._active_drivers[name] # Check if the cached driver should still be reused if self.filter_registry.config and driver.reuse( self.filter_registry.config, name ): return driver else: # Driver shouldn't be reused, clean it up and remove from cache driver.cleanup() del self._active_drivers[name] # Get driver from registry driver = self.filter_registry.get_driver(name) if driver is not None and self.filter_registry.config: # Only cache drivers that should be reused if driver.reuse(self.filter_registry.config, name): self._active_drivers[name] = driver return driver def close(self) -> None: """Close all active filter resources.""" # Clean up active drivers for driver in self._active_drivers.values(): driver.cleanup() self._active_drivers.clear() # Also close the registry self.filter_registry.close() def refresh_config(self, config: "StackedConfig") -> None: """Refresh the configuration used by the filter registry. This should be called when the configuration has changed to ensure filters use the latest settings. Args: config: The new configuration stack """ # Update the registry's config self.filter_registry.config = config # Re-setup line ending filter with new config # This will update the text filter factory to use new autocrlf settings self.filter_registry._setup_line_ending_filter() # The get_driver method will now handle checking reuse() for cached drivers def __del__(self) -> None: """Clean up on destruction.""" try: self.close() except Exception: # Don't raise exceptions in __del__ pass class FilterRegistry: """Registry for filter drivers.""" def __init__( self, config: "StackedConfig | None" = None, repo: "BaseRepo | None" = None, ) -> None: """Initialize FilterRegistry. 
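        Drivers are typically described in git configuration, for example
        (the filter name "redact" and its commands are only illustrative):

            [filter "redact"]
                clean = sed -e s/SECRET/REDACTED/g
                smudge = cat
                required = false

        get_driver("redact") would then build a ProcessFilterDriver from
        these settings via _create_from_config.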
Args: config: Git configuration stack repo: Repository instance """ self.config = config self.repo = repo self._drivers: dict[str, FilterDriver] = {} self._factories: dict[str, Callable[[FilterRegistry], FilterDriver]] = {} # Register built-in filter factories self.register_factory("lfs", self._create_lfs_filter) self.register_factory("text", self._create_text_filter) # Auto-register line ending filter if autocrlf is enabled self._setup_line_ending_filter() def register_factory( self, name: str, factory: Callable[["FilterRegistry"], FilterDriver] ) -> None: """Register a filter driver factory.""" self._factories[name] = factory def register_driver(self, name: str, driver: FilterDriver) -> None: """Register a filter driver instance.""" self._drivers[name] = driver def get_driver(self, name: str) -> FilterDriver | None: """Get a filter driver by name.""" # Check if we already have an instance if name in self._drivers: return self._drivers[name] # Try to create from config first (respect user configuration) if self.config is not None: config_driver = self._create_from_config(name) if config_driver is not None: self._drivers[name] = config_driver return config_driver # Try to create from factory as fallback if name in self._factories: factory_driver = self._factories[name](self) self._drivers[name] = factory_driver return factory_driver return None def close(self) -> None: """Close all filter drivers, ensuring process cleanup.""" for driver in self._drivers.values(): driver.cleanup() self._drivers.clear() def __del__(self) -> None: """Clean up filter drivers on destruction.""" try: self.close() except Exception: # Don't raise exceptions in __del__ pass def _create_from_config(self, name: str) -> FilterDriver | None: """Create a filter driver from config.""" if self.config is None: return None clean_cmd: str | None = None smudge_cmd: str | None = None process_cmd: str | None = None # Get process command (preferred over clean/smudge for performance) try: process_cmd_raw = self.config.get(("filter", name), "process") except KeyError: pass else: if isinstance(process_cmd_raw, bytes): process_cmd = process_cmd_raw.decode("utf-8") else: process_cmd = process_cmd_raw # Get clean command try: clean_cmd_raw = self.config.get(("filter", name), "clean") except KeyError: pass else: if isinstance(clean_cmd_raw, bytes): clean_cmd = clean_cmd_raw.decode("utf-8") else: clean_cmd = clean_cmd_raw # Get smudge command try: smudge_cmd_raw = self.config.get(("filter", name), "smudge") except KeyError: pass else: if isinstance(smudge_cmd_raw, bytes): smudge_cmd = smudge_cmd_raw.decode("utf-8") else: smudge_cmd = smudge_cmd_raw # Get required flag (defaults to False) required = self.config.get_boolean(("filter", name), "required", False) if process_cmd or clean_cmd or smudge_cmd: # Get repository working directory (only for Repo, not BaseRepo) from .repo import Repo repo_path = ( self.repo.path if self.repo and isinstance(self.repo, Repo) else None ) return ProcessFilterDriver( clean_cmd, smudge_cmd, required, repo_path, process_cmd ) return None def _create_lfs_filter(self, registry: "FilterRegistry") -> FilterDriver: """Create LFS filter driver.""" from .lfs import LFSFilterDriver, LFSStore # If we have a Repo (not just BaseRepo), use its LFS store from .repo import Repo if registry.repo is not None and isinstance(registry.repo, Repo): lfs_store = LFSStore.from_repo(registry.repo, create=True) else: # Fall back to creating a temporary LFS store import tempfile lfs_dir = tempfile.mkdtemp(prefix="dulwich-lfs-") 
lfs_store = LFSStore.create(lfs_dir) config = registry.repo.get_config_stack() if registry.repo else None return LFSFilterDriver(lfs_store, config=config) def _create_text_filter(self, registry: "FilterRegistry") -> FilterDriver: """Create text filter driver for line ending conversion. This filter is used when files have the 'text' attribute set explicitly. It always normalizes line endings on checkin (CRLF -> LF). """ from .line_ending import LineEndingFilter return LineEndingFilter.from_config(self.config, for_text_attr=True) def _setup_line_ending_filter(self) -> None: """Automatically register line ending filter if configured.""" if self.config is None: return # Parse autocrlf as bytes try: autocrlf_raw = self.config.get("core", "autocrlf") except KeyError: return else: autocrlf: bytes = ( autocrlf_raw.lower() if isinstance(autocrlf_raw, bytes) else str(autocrlf_raw).lower().encode("ascii") ) # If autocrlf is enabled, register the text filter if autocrlf in (b"true", b"input"): # Pre-create the text filter so it's available self.get_driver("text") def get_filter_for_path( path: bytes, gitattributes: "GitAttributes", filter_registry: FilterRegistry | None = None, filter_context: FilterContext | None = None, ) -> FilterDriver | None: """Get the appropriate filter driver for a given path. Args: path: Path to check gitattributes: GitAttributes object with parsed patterns filter_registry: Registry of filter drivers (deprecated, use filter_context) filter_context: Context for managing filter state Returns: FilterDriver instance or None """ # Use filter_context if provided, otherwise fall back to registry if filter_context is not None: registry = filter_context.filter_registry get_driver = filter_context.get_driver elif filter_registry is not None: registry = filter_registry get_driver = filter_registry.get_driver else: raise ValueError("Either filter_registry or filter_context must be provided") # Get all attributes for this path attributes = gitattributes.match_path(path) # Collect filters to apply filters: list[FilterDriver] = [] # Check for text attribute first (it should be applied before custom filters) text_attr = attributes.get(b"text") if text_attr is True: # Add text filter for line ending conversion text_filter = get_driver("text") if text_filter is not None: filters.append(text_filter) elif text_attr is False: # -text means binary, no conversion - but still check for custom filters pass else: # If no explicit text attribute, check if autocrlf is enabled # When autocrlf is true/input, files are treated as text by default if registry.config is not None: try: autocrlf_raw = registry.config.get("core", "autocrlf") except KeyError: pass else: autocrlf: bytes = ( autocrlf_raw.lower() if isinstance(autocrlf_raw, bytes) else str(autocrlf_raw).lower().encode("ascii") ) if autocrlf in (b"true", b"input"): # Add text filter for files without explicit attributes text_filter = get_driver("text") if text_filter is not None: filters.append(text_filter) # Check if there's a filter attribute filter_name = attributes.get(b"filter") if filter_name is not None and not isinstance(filter_name, bool): if isinstance(filter_name, bytes): filter_name_str = filter_name.decode("utf-8") driver = get_driver(filter_name_str) # Check if filter is required but missing if driver is None and registry.config is not None: required = registry.config.get_boolean( ("filter", filter_name_str), "required", False ) if required: raise FilterError( f"Required filter '{filter_name_str}' is not available" ) if driver is not 
None: filters.append(driver) # Return appropriate filter(s) if len(filters) == 0: return None elif len(filters) == 1: return filters[0] else: # Multiple filters - create a composite return CompositeFilterDriver(filters) class FilterBlobNormalizer: """Blob normalizer that applies clean/smudge filters based on gitattributes. This can be used in addition to or instead of line ending normalization. """ def __init__( self, config_stack: "StackedConfig | None", gitattributes: GitAttributes, filter_registry: FilterRegistry | None = None, repo: "BaseRepo | None" = None, filter_context: FilterContext | None = None, ) -> None: """Initialize FilterBlobNormalizer. Args: config_stack: Git configuration stack gitattributes: GitAttributes instance filter_registry: Optional filter registry to use (deprecated, use filter_context) repo: Optional repository instance filter_context: Optional filter context to use for managing filter state """ self.config_stack = config_stack self.gitattributes = gitattributes self._owns_context = False # Track if we created our own context # Support both old and new API if filter_context is not None: self.filter_context = filter_context self.filter_registry = filter_context.filter_registry self._owns_context = False # We're using an external context else: if filter_registry is not None: import warnings warnings.warn( "Passing filter_registry to FilterBlobNormalizer is deprecated. " "Pass a FilterContext instead.", DeprecationWarning, stacklevel=2, ) self.filter_registry = filter_registry else: self.filter_registry = FilterRegistry(config_stack, repo) self.filter_context = FilterContext(self.filter_registry) self._owns_context = True # We created our own context def checkin_normalize(self, blob: Blob, path: bytes) -> Blob: """Apply clean filter during checkin (working tree -> repository).""" # Get filter for this path filter_driver = get_filter_for_path( path, self.gitattributes, filter_context=self.filter_context ) if filter_driver is None: return blob # Apply clean filter filtered_data = filter_driver.clean(blob.data) if filtered_data == blob.data: return blob # Create new blob with filtered data new_blob = Blob() new_blob.data = filtered_data return new_blob def checkout_normalize(self, blob: Blob, path: bytes) -> Blob: """Apply smudge filter during checkout (repository -> working tree).""" # Get filter for this path filter_driver = get_filter_for_path( path, self.gitattributes, filter_context=self.filter_context ) if filter_driver is None: return blob # Apply smudge filter filtered_data = filter_driver.smudge(blob.data, path) if filtered_data == blob.data: return blob # Create new blob with filtered data new_blob = Blob() new_blob.data = filtered_data return new_blob def close(self) -> None: """Close all filter drivers, ensuring process cleanup.""" # Only close the filter context if we created it ourselves if self._owns_context: self.filter_context.close() def __del__(self) -> None: """Clean up filter drivers on destruction.""" try: self.close() except Exception: # Don't raise exceptions in __del__ pass dulwich-1.0.0/dulwich/gc.py000066400000000000000000000351471513301442600155620ustar00rootroot00000000000000# gc.py -- Git garbage collection implementation # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. 
You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git garbage collection implementation.""" __all__ = [ "DEFAULT_GC_AUTO", "DEFAULT_GC_AUTO_PACK_LIMIT", "GCStats", "find_reachable_objects", "find_unreachable_objects", "garbage_collect", "maybe_auto_gc", "prune_unreachable_objects", "should_run_gc", ] import logging import os import time from collections import deque from collections.abc import Callable from dataclasses import dataclass, field from typing import TYPE_CHECKING from dulwich.object_store import ( BaseObjectStore, DiskObjectStore, ) from dulwich.objects import Commit, ObjectID, Tag, Tree from dulwich.refs import RefsContainer if TYPE_CHECKING: from .config import Config from .repo import BaseRepo, Repo DEFAULT_GC_AUTO = 6700 DEFAULT_GC_AUTO_PACK_LIMIT = 50 @dataclass class GCStats: """Statistics from garbage collection.""" pruned_objects: set[ObjectID] = field(default_factory=set) bytes_freed: int = 0 packs_before: int = 0 packs_after: int = 0 loose_objects_before: int = 0 loose_objects_after: int = 0 def find_reachable_objects( object_store: BaseObjectStore, refs_container: RefsContainer, include_reflogs: bool = True, progress: Callable[[str], None] | None = None, ) -> set[ObjectID]: """Find all reachable objects in the repository. Args: object_store: Object store to search refs_container: Reference container include_reflogs: Whether to include reflog entries progress: Optional progress callback Returns: Set of reachable object SHAs """ reachable: set[ObjectID] = set() pending: deque[ObjectID] = deque() # Start with all refs for ref in refs_container.allkeys(): try: sha = refs_container[ref] # This follows symbolic refs if sha and sha not in reachable: pending.append(sha) reachable.add(sha) except KeyError: # Broken ref if progress: progress(f"Warning: Broken ref {ref.decode('utf-8', 'replace')}") continue # TODO: Add reflog support when reflog functionality is available # Walk all reachable objects while pending: sha = pending.popleft() if progress: progress(f"Checking object {sha.decode('ascii', 'replace')}") try: obj = object_store[sha] except KeyError: continue # Add referenced objects if isinstance(obj, Commit): # Tree if obj.tree not in reachable: pending.append(obj.tree) reachable.add(obj.tree) # Parents for parent in obj.parents: if parent not in reachable: pending.append(parent) reachable.add(parent) elif isinstance(obj, Tree): # Tree entries for entry in obj.items(): assert entry.sha is not None if entry.sha not in reachable: pending.append(entry.sha) reachable.add(entry.sha) elif isinstance(obj, Tag): # Tagged object if obj.object[1] not in reachable: pending.append(obj.object[1]) reachable.add(obj.object[1]) return reachable def find_unreachable_objects( object_store: BaseObjectStore, refs_container: RefsContainer, include_reflogs: bool = True, progress: Callable[[str], None] | None = None, ) -> set[ObjectID]: """Find all unreachable objects in the repository. 
Args: object_store: Object store to search refs_container: Reference container include_reflogs: Whether to include reflog entries progress: Optional progress callback Returns: Set of unreachable object SHAs """ reachable = find_reachable_objects( object_store, refs_container, include_reflogs, progress ) unreachable: set[ObjectID] = set() for sha in object_store: if sha not in reachable: unreachable.add(sha) return unreachable def prune_unreachable_objects( object_store: DiskObjectStore, refs_container: RefsContainer, grace_period: int | None = None, dry_run: bool = False, progress: Callable[[str], None] | None = None, ) -> tuple[set[ObjectID], int]: """Remove unreachable objects from the repository. Args: object_store: Object store to prune refs_container: Reference container grace_period: Grace period in seconds (objects newer than this are kept) dry_run: If True, only report what would be deleted progress: Optional progress callback Returns: Tuple of (set of pruned object SHAs, total bytes freed) """ unreachable = find_unreachable_objects( object_store, refs_container, progress=progress ) pruned: set[ObjectID] = set() bytes_freed = 0 for sha in unreachable: try: obj = object_store[sha] # Check grace period if grace_period is not None: try: mtime = object_store.get_object_mtime(sha) age = time.time() - mtime if age < grace_period: if progress: progress( f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)" ) continue except KeyError: # Object not found, skip it continue if progress: progress(f"Pruning {sha.decode('ascii', 'replace')}") # Calculate size before attempting deletion obj_size = len(obj.as_raw_string()) if not dry_run: object_store.delete_loose_object(sha) # Only count as pruned if we get here (deletion succeeded or dry run) pruned.add(sha) bytes_freed += obj_size except KeyError: # Object already gone pass except OSError as e: # File system errors during deletion if progress: progress(f"Error pruning {sha.decode('ascii', 'replace')}: {e}") return pruned, bytes_freed def garbage_collect( repo: "Repo", auto: bool = False, aggressive: bool = False, prune: bool = True, grace_period: int | None = 1209600, # 2 weeks default dry_run: bool = False, progress: Callable[[str], None] | None = None, ) -> GCStats: """Run garbage collection on a repository. 
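    Example (a minimal sketch; the repository path is an assumption):

        from dulwich.repo import Repo

        with Repo("/path/to/repo") as r:
            stats = garbage_collect(r, prune=True, grace_period=14 * 24 * 3600)
            print(f"freed {stats.bytes_freed} bytes, "
                  f"pruned {len(stats.pruned_objects)} objects")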
Args: repo: Repository to garbage collect auto: Whether this is an automatic gc aggressive: Whether to use aggressive settings prune: Whether to prune unreachable objects grace_period: Grace period for pruning in seconds dry_run: If True, only report what would be done progress: Optional progress callback Returns: GCStats object with garbage collection statistics """ stats = GCStats() object_store = repo.object_store refs_container = repo.refs # Count initial state stats.packs_before = len(list(object_store.packs)) stats.loose_objects_before = object_store.count_loose_objects() # Find unreachable objects to exclude from repacking unreachable_to_prune = set() if prune: if progress: progress("Finding unreachable objects") unreachable = find_unreachable_objects( object_store, refs_container, progress=progress ) # Apply grace period check for sha in unreachable: try: if grace_period is not None: try: mtime = object_store.get_object_mtime(sha) age = time.time() - mtime if age < grace_period: if progress: progress( f"Keeping {sha.decode('ascii', 'replace')} (age: {age:.0f}s < grace period: {grace_period}s)" ) continue except KeyError: # Object not found, skip it continue unreachable_to_prune.add(sha) obj = object_store[sha] stats.bytes_freed += len(obj.as_raw_string()) except KeyError: pass stats.pruned_objects = unreachable_to_prune # Pack refs if progress: progress("Packing references") if not dry_run: repo.refs.pack_refs() # Delete loose unreachable objects if prune and not dry_run: for sha in unreachable_to_prune: if object_store.contains_loose(sha): try: object_store.delete_loose_object(sha) except OSError: pass # Repack everything, excluding unreachable objects # This handles both loose object packing and pack consolidation if progress: progress("Repacking repository") if not dry_run: if prune and unreachable_to_prune: # Repack excluding unreachable objects object_store.repack(exclude=unreachable_to_prune, progress=progress) else: # Normal repack object_store.repack(progress=progress) # Prune orphaned temporary files if progress: progress("Pruning temporary files") if not dry_run: object_store.prune(grace_period=grace_period) # Count final state stats.packs_after = len(list(object_store.packs)) stats.loose_objects_after = object_store.count_loose_objects() return stats def should_run_gc(repo: "BaseRepo", config: "Config | None" = None) -> bool: """Check if automatic garbage collection should run. 
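    Example (roughly what maybe_auto_gc below does, minus gc.log handling;
    "repo" is assumed to be an open repository):

        if should_run_gc(repo):
            garbage_collect(repo, auto=True)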
Args: repo: Repository to check config: Configuration to use (defaults to repo config) Returns: True if GC should run, False otherwise """ # Check environment variable first if os.environ.get("GIT_AUTO_GC") == "0": return False # Check programmatic disable flag if getattr(repo, "_autogc_disabled", False): return False if config is None: config = repo.get_config() # Check if auto GC is disabled try: gc_auto = config.get(b"gc", b"auto") gc_auto_value = int(gc_auto) except KeyError: gc_auto_value = DEFAULT_GC_AUTO if gc_auto_value == 0: # Auto GC is disabled return False # Check loose object count object_store = repo.object_store if not isinstance(object_store, DiskObjectStore): # Can't count loose objects on non-disk stores return False loose_count = object_store.count_loose_objects() if loose_count >= gc_auto_value: return True # Check pack file count try: gc_auto_pack_limit = config.get(b"gc", b"autoPackLimit") pack_limit = int(gc_auto_pack_limit) except KeyError: pack_limit = DEFAULT_GC_AUTO_PACK_LIMIT if pack_limit > 0: pack_count = object_store.count_pack_files() if pack_count >= pack_limit: return True return False def maybe_auto_gc( repo: "Repo", config: "Config | None" = None, progress: Callable[[str], None] | None = None, ) -> bool: """Run automatic garbage collection if needed. Args: repo: Repository to potentially GC config: Configuration to use (defaults to repo config) progress: Optional progress reporting callback Returns: True if GC was run, False otherwise """ if not should_run_gc(repo, config): return False # Check for gc.log file - only for disk-based repos if not hasattr(repo, "controldir"): # For non-disk repos, just run GC without gc.log handling garbage_collect(repo, auto=True, progress=progress) return True gc_log_path = os.path.join(repo.controldir(), "gc.log") if os.path.exists(gc_log_path): # Check gc.logExpiry if config is None: config = repo.get_config() try: log_expiry = config.get(b"gc", b"logExpiry") except KeyError: # Default to 1 day expiry_seconds = 86400 else: # Parse time value (simplified - just support days for now) if log_expiry.endswith((b".days", b".day")): days = int(log_expiry.split(b".")[0]) expiry_seconds = days * 86400 else: # Default to 1 day expiry_seconds = 86400 stat_info = os.stat(gc_log_path) if time.time() - stat_info.st_mtime < expiry_seconds: # gc.log exists and is not expired - skip GC with open(gc_log_path, "rb") as f: logging.info( "gc.log content: %s", f.read().decode("utf-8", errors="replace") ) return False # TODO: Support gc.autoDetach to run in background # For now, run in foreground try: # Run GC with auto=True flag garbage_collect(repo, auto=True, progress=progress) # Remove gc.log on successful completion if os.path.exists(gc_log_path): try: os.unlink(gc_log_path) except FileNotFoundError: pass return True except OSError as e: # Write error to gc.log with open(gc_log_path, "wb") as f: f.write(f"Auto GC failed: {e}\n".encode()) # Don't propagate the error - auto GC failures shouldn't break operations return False dulwich-1.0.0/dulwich/graph.py000066400000000000000000000320531513301442600162630ustar00rootroot00000000000000# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab # Copyright (c) 2020 Kevin B. Hendricks, Stratford Ontario Canada # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. 
You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Implementation of merge-base following the approach of git.""" __all__ = [ "WorkList", "can_fast_forward", "find_merge_base", "find_octopus_base", "independent", ] from collections.abc import Callable, Iterator, Mapping, Sequence from heapq import heappop, heappush from typing import TYPE_CHECKING, Generic, TypeVar if TYPE_CHECKING: from .repo import BaseRepo from .lru_cache import LRUCache from .objects import Commit, ObjectID T = TypeVar("T") # priority queue using builtin python minheap tools # why they do not have a builtin maxheap is simply ridiculous but # liveable with integer time stamps using negation class WorkList(Generic[T]): """Priority queue for commit processing using a min-heap.""" def __init__(self) -> None: """Initialize an empty work list.""" self.pq: list[tuple[int, T]] = [] def add(self, item: tuple[int, T]) -> None: """Add an item to the work list. Args: item: Tuple of (timestamp, commit) """ dt, cmt = item heappush(self.pq, (-dt, cmt)) def get(self) -> tuple[int, T] | None: """Get the highest priority item from the work list. Returns: Tuple of (timestamp, commit) or None if empty """ item = heappop(self.pq) if item: pr, cmt = item return -pr, cmt return None def iter(self) -> Iterator[tuple[int, T]]: """Iterate over items in the work list. Yields: Tuples of (timestamp, commit) """ for pr, cmt in self.pq: yield (-pr, cmt) def _find_lcas( lookup_parents: Callable[[ObjectID], list[ObjectID]], c1: ObjectID, c2s: Sequence[ObjectID], lookup_stamp: Callable[[ObjectID], int], min_stamp: int = 0, shallows: set[ObjectID] | None = None, ) -> list[ObjectID]: """Find lowest common ancestors between commits. Args: lookup_parents: Function to get parent commits c1: First commit c2s: List of second commits lookup_stamp: Function to get commit timestamp min_stamp: Minimum timestamp to consider shallows: Set of shallow commits Returns: List of lowest common ancestor commit IDs """ cands = [] cstates: dict[ObjectID, int] = {} # Flags to Record State _ANC_OF_1 = 1 # ancestor of commit 1 _ANC_OF_2 = 2 # ancestor of commit 2 _DNC = 4 # Do Not Consider _LCA = 8 # potential LCA (Lowest Common Ancestor) def _has_candidates( wlst: WorkList[ObjectID], cstates: Mapping[ObjectID, int] ) -> bool: """Check if there are any candidate commits in the work list. 
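    Example (a small self-contained sketch of the surrounding work list; the
    commit ids are placeholder byte strings). The WorkList is a max-heap keyed
    on commit timestamp, so the newest queued commit is inspected first:

        wl: WorkList[bytes] = WorkList()
        wl.add((100, b"older"))
        wl.add((200, b"newer"))
        assert wl.get() == (200, b"newer")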
Args: wlst: Work list of commits cstates: Dictionary of commit states Returns: True if there are candidates to process """ for dt, cmt in wlst.iter(): if cmt in cstates: if not ((cstates[cmt] & _DNC) == _DNC): return True return False # initialize the working list states with ancestry info # note possibility of c1 being one of c2s should be handled wlst: WorkList[ObjectID] = WorkList() cstates[c1] = _ANC_OF_1 try: wlst.add((lookup_stamp(c1), c1)) except KeyError: # If c1 doesn't exist and we have shallow commits, it might be a missing parent if shallows is None or not shallows: raise # For missing commits in shallow repos, use a minimal timestamp wlst.add((0, c1)) for c2 in c2s: cflags = cstates.get(c2, 0) cstates[c2] = cflags | _ANC_OF_2 try: wlst.add((lookup_stamp(c2), c2)) except KeyError: # If c2 doesn't exist and we have shallow commits, it might be a missing parent if shallows is None or not shallows: raise # For missing commits in shallow repos, use a minimal timestamp wlst.add((0, c2)) # loop while at least one working list commit is still viable (not marked as _DNC) # adding any parents to the list in a breadth first manner while _has_candidates(wlst, cstates): result = wlst.get() if result is None: break dt, cmt = result # Look only at ANCESTRY and _DNC flags so that already # found _LCAs can still be marked _DNC by lower _LCAS cflags = cstates[cmt] & (_ANC_OF_1 | _ANC_OF_2 | _DNC) if cflags == (_ANC_OF_1 | _ANC_OF_2): # potential common ancestor if not already in candidates add it if not (cstates[cmt] & _LCA) == _LCA: cstates[cmt] = cstates[cmt] | _LCA cands.append((dt, cmt)) # mark any parents of this node _DNC as all parents # would be one generation further removed common ancestors cflags = cflags | _DNC try: parents = lookup_parents(cmt) except KeyError: # If we can't get parents in a shallow repo, skip this node # This is safer than pretending it has no parents if shallows is not None and shallows: continue raise if parents: for pcmt in parents: pflags = cstates.get(pcmt, 0) # if this parent was already visited with no new ancestry/flag information # do not add it to the working list again if (pflags & cflags) == cflags: continue try: pdt = lookup_stamp(pcmt) except KeyError: # Parent doesn't exist - if we're in a shallow repo, skip it if shallows is not None and shallows: continue raise if pdt < min_stamp: continue cstates[pcmt] = pflags | cflags wlst.add((pdt, pcmt)) # walk final candidates removing any superseded by _DNC by later lower _LCAs # remove any duplicates and sort it so that earliest is first results = [] for dt, cmt in cands: if not ((cstates[cmt] & _DNC) == _DNC) and (dt, cmt) not in results: results.append((dt, cmt)) results.sort(key=lambda x: x[0]) lcas = [cmt for dt, cmt in results] return lcas # actual git sorts these based on commit times def find_merge_base(repo: "BaseRepo", commit_ids: Sequence[ObjectID]) -> list[ObjectID]: """Find lowest common ancestors of commit_ids[0] and *any* of commits_ids[1:]. 
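    Example (a minimal sketch; the repository path and ref names are
    placeholders for an existing local repository):

        from dulwich.repo import Repo

        repo = Repo("/path/to/repo")
        c1 = repo.refs[b"refs/heads/feature"]
        c2 = repo.refs[b"refs/heads/master"]
        bases = find_merge_base(repo, [c1, c2])
        # [] if the commits share no history, otherwise the LCA commit ids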
Args: repo: Repository object commit_ids: list of commit ids Returns: list of lowest common ancestor commit_ids """ cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128) parents_provider = repo.parents_provider() def lookup_stamp(cmtid: ObjectID) -> int: if cmtid not in cmtcache: obj = repo.object_store[cmtid] assert isinstance(obj, Commit) cmtcache[cmtid] = obj commit_time = cmtcache[cmtid].commit_time assert isinstance(commit_time, int) return commit_time def lookup_parents(cmtid: ObjectID) -> list[ObjectID]: commit = None if cmtid in cmtcache: commit = cmtcache[cmtid] # must use parents provider to handle grafts and shallow return parents_provider.get_parents(cmtid, commit=commit) if not commit_ids: return [] c1 = commit_ids[0] if not len(commit_ids) > 1: return [c1] c2s = list(commit_ids[1:]) if c1 in c2s: return [c1] lcas = _find_lcas( lookup_parents, c1, c2s, lookup_stamp, shallows=parents_provider.shallows ) return lcas def find_octopus_base( repo: "BaseRepo", commit_ids: Sequence[ObjectID] ) -> list[ObjectID]: """Find lowest common ancestors of *all* provided commit_ids. Args: repo: Repository commit_ids: list of commit ids Returns: list of lowest common ancestor commit_ids """ cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128) parents_provider = repo.parents_provider() def lookup_stamp(cmtid: ObjectID) -> int: if cmtid not in cmtcache: obj = repo.object_store[cmtid] assert isinstance(obj, Commit) cmtcache[cmtid] = obj commit_time = cmtcache[cmtid].commit_time assert isinstance(commit_time, int) return commit_time def lookup_parents(cmtid: ObjectID) -> list[ObjectID]: commit = None if cmtid in cmtcache: commit = cmtcache[cmtid] # must use parents provider to handle grafts and shallow return parents_provider.get_parents(cmtid, commit=commit) if not commit_ids: return [] if len(commit_ids) <= 2: return find_merge_base(repo, commit_ids) lcas = [commit_ids[0]] others = commit_ids[1:] for cmt in others: next_lcas = [] for ca in lcas: res = _find_lcas( lookup_parents, cmt, [ca], lookup_stamp, shallows=parents_provider.shallows, ) next_lcas.extend(res) lcas = next_lcas[:] return lcas def can_fast_forward(repo: "BaseRepo", c1: ObjectID, c2: ObjectID) -> bool: """Is it possible to fast-forward from c1 to c2? 
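    Example (a minimal sketch; the repository path and ref names are
    placeholders):

        from dulwich.repo import Repo

        repo = Repo("/path/to/repo")
        local = repo.refs[b"refs/heads/master"]
        remote = repo.refs[b"refs/remotes/origin/master"]
        if can_fast_forward(repo, local, remote):
            repo.refs[b"refs/heads/master"] = remote  # plain fast-forward update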
Args: repo: Repository to retrieve objects from c1: Commit id for first commit c2: Commit id for second commit """ cmtcache: LRUCache[ObjectID, Commit] = LRUCache(max_cache=128) parents_provider = repo.parents_provider() def lookup_stamp(cmtid: ObjectID) -> int: if cmtid not in cmtcache: obj = repo.object_store[cmtid] assert isinstance(obj, Commit) cmtcache[cmtid] = obj commit_time = cmtcache[cmtid].commit_time assert isinstance(commit_time, int) return commit_time def lookup_parents(cmtid: ObjectID) -> list[ObjectID]: commit = None if cmtid in cmtcache: commit = cmtcache[cmtid] # must use parents provider to handle grafts and shallow return parents_provider.get_parents(cmtid, commit=commit) if c1 == c2: return True # Algorithm: Find the common ancestor try: min_stamp = lookup_stamp(c1) except KeyError: # If c1 doesn't exist in the object store, we can't determine fast-forward # This can happen in shallow clones where c1 is a missing parent # Check if any shallow commits have c1 as a parent if parents_provider.shallows: # We're in a shallow repository and c1 doesn't exist # We can't determine if fast-forward is possible return False raise lcas = _find_lcas( lookup_parents, c1, [c2], lookup_stamp, min_stamp=min_stamp, shallows=parents_provider.shallows, ) return lcas == [c1] def independent(repo: "BaseRepo", commit_ids: Sequence[ObjectID]) -> list[ObjectID]: """Filter commits to only those that are not reachable from others. Args: repo: Repository object commit_ids: list of commit ids to filter Returns: list of commit ids that are not ancestors of any other commits in the list """ if not commit_ids: return [] if len(commit_ids) == 1: return list(commit_ids) # Filter out commits that are ancestors of other commits independent_commits = [] for i, commit_id in enumerate(commit_ids): is_independent = True # Check if this commit is an ancestor of any other commit for j, other_id in enumerate(commit_ids): if i == j: continue # If merge base of (commit_id, other_id) is commit_id, # then commit_id is an ancestor of other_id merge_bases = find_merge_base(repo, [commit_id, other_id]) if merge_bases == [commit_id]: is_independent = False break if is_independent: independent_commits.append(commit_id) return independent_commits dulwich-1.0.0/dulwich/hooks.py000066400000000000000000000175151513301442600163130ustar00rootroot00000000000000# hooks.py -- for dealing with git hooks # Copyright (C) 2012-2013 Jelmer Vernooij and others. # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# """Access to hooks.""" __all__ = [ "CommitMsgShellHook", "Hook", "PostCommitShellHook", "PostReceiveShellHook", "PreCommitShellHook", "ShellHook", ] import os import subprocess from collections.abc import Callable, Sequence from typing import Any from .errors import HookError class Hook: """Generic hook object.""" def execute(self, *args: Any) -> Any: # noqa: ANN401 """Execute the hook with the given args. Args: args: argument list to hook Raises: HookError: hook execution failure Returns: a hook may return a useful value """ raise NotImplementedError(self.execute) class ShellHook(Hook): """Hook by executable file. Implements standard githooks(5) [0]: [0] http://www.kernel.org/pub/software/scm/git/docs/githooks.html """ def __init__( self, name: str, path: str, numparam: int, pre_exec_callback: Callable[..., Any] | None = None, post_exec_callback: Callable[..., Any] | None = None, cwd: str | None = None, ) -> None: """Setup shell hook definition. Args: name: name of hook for error messages path: absolute path to executable file numparam: number of requirements parameters pre_exec_callback: closure for setup before execution Defaults to None. Takes in the variable argument list from the execute functions and returns a modified argument list for the shell hook. post_exec_callback: closure for cleanup after execution Defaults to None. Takes in a boolean for hook success and the modified argument list and returns the final hook return value if applicable cwd: working directory to switch to when executing the hook """ self.name = name self.filepath = path self.numparam = numparam self.pre_exec_callback = pre_exec_callback self.post_exec_callback = post_exec_callback self.cwd = cwd def execute(self, *args: Any) -> Any: # noqa: ANN401 """Execute the hook with given args.""" if len(args) != self.numparam: raise HookError( f"Hook {self.name} executed with wrong number of args. Expected {self.numparam}. Saw {len(args)}. args: {args}" ) if self.pre_exec_callback is not None: args = self.pre_exec_callback(*args) try: ret = subprocess.call( [os.path.relpath(self.filepath, self.cwd), *list(args)], cwd=self.cwd ) if ret != 0: if self.post_exec_callback is not None: self.post_exec_callback(0, *args) raise HookError(f"Hook {self.name} exited with non-zero status {ret}") if self.post_exec_callback is not None: return self.post_exec_callback(1, *args) except FileNotFoundError: # no file. silent failure. if self.post_exec_callback is not None: self.post_exec_callback(0, *args) class PreCommitShellHook(ShellHook): """pre-commit shell hook.""" def __init__(self, cwd: str, controldir: str) -> None: """Initialize pre-commit hook. Args: cwd: Working directory for hook execution controldir: Path to the git control directory (.git) """ filepath = os.path.join(controldir, "hooks", "pre-commit") ShellHook.__init__(self, "pre-commit", filepath, 0, cwd=cwd) class PostCommitShellHook(ShellHook): """post-commit shell hook.""" def __init__(self, controldir: str) -> None: """Initialize post-commit hook. Args: controldir: Path to the git control directory (.git) """ filepath = os.path.join(controldir, "hooks", "post-commit") ShellHook.__init__(self, "post-commit", filepath, 0, cwd=controldir) class CommitMsgShellHook(ShellHook): """commit-msg shell hook.""" def __init__(self, controldir: str) -> None: """Initialize commit-msg hook. 
Args: controldir: Path to the git control directory (.git) """ filepath = os.path.join(controldir, "hooks", "commit-msg") def prepare_msg(*args: bytes) -> tuple[str, ...]: import tempfile (fd, path) = tempfile.mkstemp() with os.fdopen(fd, "wb") as f: f.write(args[0]) return (path,) def clean_msg(success: int, *args: str) -> bytes | None: if success: with open(args[0], "rb") as f: new_msg = f.read() os.unlink(args[0]) return new_msg os.unlink(args[0]) return None ShellHook.__init__( self, "commit-msg", filepath, 1, prepare_msg, clean_msg, controldir ) class PostReceiveShellHook(ShellHook): """post-receive shell hook.""" def __init__(self, controldir: str) -> None: """Initialize post-receive hook. Args: controldir: Path to the git control directory (.git) """ self.controldir = controldir filepath = os.path.join(controldir, "hooks", "post-receive") ShellHook.__init__(self, "post-receive", path=filepath, numparam=0) def execute( self, client_refs: Sequence[tuple[bytes, bytes, bytes]] ) -> bytes | None: """Execute the post-receive hook. Args: client_refs: List of tuples containing (old_sha, new_sha, ref_name) for each updated reference Returns: Output from the hook execution or None if hook doesn't exist Raises: HookError: If hook execution fails """ # do nothing if the script doesn't exist if not os.path.exists(self.filepath): return None try: env = os.environ.copy() env["GIT_DIR"] = self.controldir p = subprocess.Popen( self.filepath, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, ) # client_refs is a list of (oldsha, newsha, ref) in_data = b"\n".join([b" ".join(ref) for ref in client_refs]) out_data, err_data = p.communicate(in_data) if (p.returncode != 0) or err_data: err_msg = ( f"post-receive exit code: {p.returncode}\n" f"stdout:\n{out_data.decode('utf-8', 'backslashreplace')}\n" f"stderr:\n{err_data.decode('utf-8', 'backslashreplace')}" ) raise HookError(err_msg) return out_data except OSError as err: raise HookError(repr(err)) from err dulwich-1.0.0/dulwich/ignore.py000066400000000000000000000571301513301442600164500ustar00rootroot00000000000000# Copyright (C) 2017 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Parsing of gitignore files. For details for the matching rules, see https://git-scm.com/docs/gitignore Important: When checking if directories are ignored, include a trailing slash in the path. For example, use "dir/" instead of "dir" to check if a directory is ignored. 
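Example (a short sketch of the trailing-slash convention, using in-memory
patterns):

    f = IgnoreFilter([b"build/"])
    f.is_ignored("build/")     # True: the directory itself
    f.is_ignored("build/x.o")  # True: files below the directory
    f.is_ignored("build")      # None: without the slash it is treated as a file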
""" __all__ = [ "IgnoreFilter", "IgnoreFilterManager", "IgnoreFilterStack", "Pattern", "default_user_ignore_filter_path", "match_pattern", "read_ignore_patterns", "translate", ] import os.path import re from collections.abc import Iterable, Sequence from contextlib import suppress from typing import TYPE_CHECKING, BinaryIO if TYPE_CHECKING: from .repo import Repo from .config import Config, get_xdg_config_home_path def _pattern_to_str(pattern: "Pattern | bytes | str") -> str: """Convert a pattern to string, handling both Pattern objects and raw patterns.""" if isinstance(pattern, Pattern): pattern_data: bytes | str = pattern.pattern else: pattern_data = pattern return pattern_data.decode() if isinstance(pattern_data, bytes) else pattern_data def _check_parent_exclusion(path: str, matching_patterns: Sequence["Pattern"]) -> bool: """Check if a parent directory exclusion prevents negation patterns from taking effect. Args: path: Path to check matching_patterns: List of Pattern objects that matched the path Returns: True if parent exclusion applies (negation should be ineffective), False otherwise """ # Find the last negation pattern final_negation = next( (p for p in reversed(matching_patterns) if not p.is_exclude), None ) if not final_negation: return False final_pattern_str = _pattern_to_str(final_negation) # Check if any exclusion pattern excludes a parent directory return any( pattern.is_exclude and _pattern_excludes_parent(_pattern_to_str(pattern), path, final_pattern_str) for pattern in matching_patterns ) def _pattern_excludes_parent( pattern_str: str, path: str, final_pattern_str: str ) -> bool: """Check if a pattern excludes a parent directory of the given path.""" # Handle **/middle/** patterns if pattern_str.startswith("**/") and pattern_str.endswith("/**"): middle = pattern_str[3:-3] return f"/{middle}/" in f"/{path}" or path.startswith(f"{middle}/") # Handle dir/** patterns if pattern_str.endswith("/**") and not pattern_str.startswith("**/"): base_dir = pattern_str[:-3] if not path.startswith(base_dir + "/"): return False remaining = path[len(base_dir) + 1 :] # Special case: dir/** allows immediate child file negations if ( not path.endswith("/") and final_pattern_str.startswith("!") and "/" not in remaining ): neg_pattern = final_pattern_str[1:] if neg_pattern == path or ("*" in neg_pattern and "**" not in neg_pattern): return False # Nested files with ** negation patterns if "**" in final_pattern_str and Pattern(final_pattern_str[1:].encode()).match( path.encode() ): return False return True # Directory patterns (ending with /) can exclude parent directories if pattern_str.endswith("/") and "/" in path: p = Pattern(pattern_str.encode()) parts = path.split("/") return any( p.match(("/".join(parts[:i]) + "/").encode()) for i in range(1, len(parts)) ) return False def _translate_segment(segment: bytes) -> bytes: """Translate a single path segment to regex, following Git rules exactly.""" if segment == b"*": return b"[^/]+" res = b"" i, n = 0, len(segment) while i < n: c = segment[i : i + 1] i += 1 if c == b"*": res += b"[^/]*" elif c == b"?": res += b"[^/]" elif c == b"\\": if i < n: res += re.escape(segment[i : i + 1]) i += 1 else: res += re.escape(c) elif c == b"[": j = i if j < n and segment[j : j + 1] == b"!": j += 1 if j < n and segment[j : j + 1] == b"]": j += 1 while j < n and segment[j : j + 1] != b"]": j += 1 if j >= n: res += b"\\[" else: stuff = segment[i:j].replace(b"\\", b"\\\\") i = j + 1 if stuff.startswith(b"!"): stuff = b"^" + stuff[1:] elif 
stuff.startswith(b"^"): stuff = b"\\" + stuff res += b"[" + stuff + b"]" else: res += re.escape(c) return res def _handle_double_asterisk(segments: Sequence[bytes], i: int) -> tuple[bytes, bool]: """Handle ** segment processing, returns (regex_part, skip_next).""" # Check if ** is at end remaining = segments[i + 1 :] if all(s == b"" for s in remaining): # ** at end - matches everything return b".*", False # Check if next segment is also ** if i + 1 < len(segments) and segments[i + 1] == b"**": # Consecutive ** segments # Check if this ends with a directory pattern (trailing /) remaining_after_next = segments[i + 2 :] is_dir_pattern = ( len(remaining_after_next) == 1 and remaining_after_next[0] == b"" ) if is_dir_pattern: # Pattern like c/**/**/ - requires at least one intermediate directory return b"[^/]+/(?:[^/]+/)*", True else: # Pattern like c/**/**/d - allows zero intermediate directories return b"(?:[^/]+/)*", True else: # ** in middle - handle differently depending on what follows if i == 0: # ** at start - any prefix return b"(?:.*/)??", False else: # ** in middle - match zero or more complete directory segments return b"(?:[^/]+/)*", False def _handle_leading_patterns(pat: bytes, res: bytes) -> tuple[bytes, bytes]: """Handle leading patterns like ``/**/``, ``**/``, or ``/``.""" if pat.startswith(b"/**/"): # Leading /** is same as ** return pat[4:], b"(.*/)?" elif pat.startswith(b"**/"): # Leading **/ return pat[3:], b"(.*/)?" elif pat.startswith(b"/"): # Leading / means relative to .gitignore location return pat[1:], b"" else: return pat, b"" def translate(pat: bytes) -> bytes: """Translate a gitignore pattern to a regular expression following Git rules exactly.""" res = b"(?ms)" # Check for invalid patterns with // - Git treats these as broken patterns if b"//" in pat: # Pattern with // doesn't match anything in Git return b"(?!.*)" # Negative lookahead - matches nothing # Don't normalize consecutive ** patterns - Git treats them specially # c/**/**/ requires at least one intermediate directory # So we keep the pattern as-is # Handle patterns with no slashes (match at any level) if b"/" not in pat[:-1]: # No slash except possibly at end res += b"(.*/)?" # Handle leading patterns pat, prefix_added = _handle_leading_patterns(pat, res) if prefix_added: res += prefix_added # Process the rest of the pattern if pat == b"**": res += b".*" else: segments = pat.split(b"/") i = 0 while i < len(segments): segment = segments[i] # Add slash separator (except for first segment) if i > 0 and segments[i - 1] != b"**": res += re.escape(b"/") if segment == b"**": regex_part, skip_next = _handle_double_asterisk(segments, i) res += regex_part if regex_part == b".*": # End of pattern break if skip_next: i += 1 else: res += _translate_segment(segment) i += 1 # Add optional trailing slash for files if not pat.endswith(b"/"): res += b"/?" return res + b"\\Z" def read_ignore_patterns(f: BinaryIO) -> Iterable[bytes]: """Read a git ignore file. Args: f: File-like object to read from Returns: List of patterns """ for line in f: line = line.rstrip(b"\r\n") # Ignore blank lines, they're used for readability. if not line.strip(): continue if line.startswith(b"#"): # Comment continue # Trailing spaces are ignored unless they are quoted with a backslash. while line.endswith(b" ") and not line.endswith(b"\\ "): line = line[:-1] line = line.replace(b"\\ ", b" ") yield line def match_pattern(path: bytes, pattern: bytes, ignorecase: bool = False) -> bool: """Match a gitignore-style pattern against a path. 
Args: path: Path to match pattern: Pattern to match ignorecase: Whether to do case-sensitive matching Returns: bool indicating whether the pattern matched """ return Pattern(pattern, ignorecase).match(path) class Pattern: """A single ignore pattern.""" def __init__(self, pattern: bytes, ignorecase: bool = False) -> None: """Initialize a Pattern object. Args: pattern: The gitignore pattern as bytes. ignorecase: Whether to perform case-insensitive matching. """ self.pattern = pattern self.ignorecase = ignorecase # Handle negation if pattern.startswith(b"!"): self.is_exclude = False pattern = pattern[1:] else: # Handle escaping of ! and # at start only if ( pattern.startswith(b"\\") and len(pattern) > 1 and pattern[1:2] in (b"!", b"#") ): pattern = pattern[1:] self.is_exclude = True # Check if this is a directory-only pattern self.is_directory_only = pattern.endswith(b"/") flags = 0 if self.ignorecase: flags = re.IGNORECASE self._re = re.compile(translate(pattern), flags) def __bytes__(self) -> bytes: """Return the pattern as bytes. Returns: The original pattern as bytes. """ return self.pattern def __str__(self) -> str: """Return the pattern as a string. Returns: The pattern decoded as a string. """ return os.fsdecode(self.pattern) def __eq__(self, other: object) -> bool: """Check equality with another Pattern object. Args: other: The object to compare with. Returns: True if patterns and ignorecase flags are equal, False otherwise. """ return ( isinstance(other, type(self)) and self.pattern == other.pattern and self.ignorecase == other.ignorecase ) def __repr__(self) -> str: """Return a string representation of the Pattern object. Returns: A string representation for debugging. """ return f"{type(self).__name__}({self.pattern!r}, {self.ignorecase!r})" def match(self, path: bytes) -> bool: """Try to match a path against this ignore pattern. Args: path: Path to match (relative to ignore location) Returns: boolean """ # For negation directory patterns (e.g., !dir/), only match directories if self.is_directory_only and not self.is_exclude and not path.endswith(b"/"): return False # Check if the regex matches if self._re.match(path): return True # For exclusion directory patterns, also match files under the directory if ( self.is_directory_only and self.is_exclude and not path.endswith(b"/") and b"/" in path ): return bool(self._re.match(path.rsplit(b"/", 1)[0] + b"/")) return False class IgnoreFilter: """Filter to apply gitignore patterns. Important: When checking if directories are ignored, include a trailing slash. For example, use is_ignored("dir/") instead of is_ignored("dir"). """ def __init__( self, patterns: Iterable[bytes], ignorecase: bool = False, path: str | None = None, ) -> None: """Initialize an IgnoreFilter with a set of patterns. Args: patterns: An iterable of gitignore patterns as bytes. ignorecase: Whether to perform case-insensitive matching. path: Optional path to the ignore file for debugging purposes. """ self._patterns: list[Pattern] = [] self._ignorecase = ignorecase self._path = path for pattern in patterns: self.append_pattern(pattern) def append_pattern(self, pattern: bytes) -> None: """Add a pattern to the set.""" self._patterns.append(Pattern(pattern, self._ignorecase)) def find_matching(self, path: bytes | str) -> Iterable[Pattern]: """Yield all matching patterns for path. 
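        Example (a short in-memory sketch):

            f = IgnoreFilter([b"*.log", b"!important.log"])
            [bytes(p) for p in f.find_matching(b"important.log")]
            # [b'*.log', b'!important.log']; the trailing negation is why
            # f.is_ignored(b"important.log") is False while
            # f.is_ignored(b"debug.log") is True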
Args: path: Path to match Returns: Iterator over iterators """ if not isinstance(path, bytes): path = os.fsencode(path) for pattern in self._patterns: if pattern.match(path): yield pattern def is_ignored(self, path: bytes | str) -> bool | None: """Check whether a path is ignored using Git-compliant logic. For directories, include a trailing slash. Returns: status is None if file is not mentioned, True if it is included, False if it is explicitly excluded. """ matching_patterns = list(self.find_matching(path)) if not matching_patterns: return None # Basic rule: last matching pattern wins last_pattern = matching_patterns[-1] result = last_pattern.is_exclude # Apply Git's parent directory exclusion rule for negations if not result: # Only applies to inclusions (negations) result = self._apply_parent_exclusion_rule( path.decode() if isinstance(path, bytes) else path, matching_patterns ) return result def _apply_parent_exclusion_rule( self, path: str, matching_patterns: list[Pattern] ) -> bool: """Apply Git's parent directory exclusion rule. "It is not possible to re-include a file if a parent directory of that file is excluded." """ return _check_parent_exclusion(path, matching_patterns) @classmethod def from_path( cls, path: str | os.PathLike[str], ignorecase: bool = False ) -> "IgnoreFilter": """Create an IgnoreFilter from a file path. Args: path: Path to the ignore file. ignorecase: Whether to perform case-insensitive matching. Returns: An IgnoreFilter instance with patterns loaded from the file. """ with open(path, "rb") as f: return cls(read_ignore_patterns(f), ignorecase, path=str(path)) def __repr__(self) -> str: """Return string representation of IgnoreFilter.""" path = getattr(self, "_path", None) if path is not None: return f"{type(self).__name__}.from_path({path!r})" else: return f"<{type(self).__name__}>" class IgnoreFilterStack: """Check for ignore status in multiple filters.""" def __init__(self, filters: list[IgnoreFilter]) -> None: """Initialize an IgnoreFilterStack with multiple filters. Args: filters: A list of IgnoreFilter objects to check in order. """ self._filters = filters def is_ignored(self, path: str) -> bool | None: """Check whether a path is explicitly included or excluded in ignores. Args: path: Path to check Returns: None if the file is not mentioned, True if it is included, False if it is explicitly excluded. """ for filter in self._filters: status = filter.is_ignored(path) if status is not None: return status return None def __repr__(self) -> str: """Return a string representation of the IgnoreFilterStack. Returns: A string representation for debugging. """ return f"{type(self).__name__}({self._filters!r})" def default_user_ignore_filter_path(config: Config) -> str: """Return default user ignore filter path. Args: config: A Config object Returns: Path to a global ignore file """ try: value = config.get((b"core",), b"excludesFile") assert isinstance(value, bytes) return value.decode(encoding="utf-8") except KeyError: pass return get_xdg_config_home_path("git", "ignore") class IgnoreFilterManager: """Ignore file manager with Git-compliant behavior. Important: When checking if directories are ignored, include a trailing slash. For example, use is_ignored("dir/") instead of is_ignored("dir"). """ def __init__( self, top_path: str, global_filters: list[IgnoreFilter], ignorecase: bool, ) -> None: """Initialize an IgnoreFilterManager. Args: top_path: The top-level directory path to manage ignores for. global_filters: List of global ignore filters to apply. 
ignorecase: Whether to perform case-insensitive matching. """ self._path_filters: dict[str, IgnoreFilter | None] = {} self._top_path = top_path self._global_filters = global_filters self._ignorecase = ignorecase def __repr__(self) -> str: """Return string representation of IgnoreFilterManager.""" return f"{type(self).__name__}({self._top_path}, {self._global_filters!r}, {self._ignorecase!r})" def _load_path(self, path: str) -> IgnoreFilter | None: try: return self._path_filters[path] except KeyError: pass p = os.path.join(self._top_path, path, ".gitignore") try: self._path_filters[path] = IgnoreFilter.from_path(p, self._ignorecase) except (FileNotFoundError, NotADirectoryError): self._path_filters[path] = None except OSError as e: # On Windows, opening a path that contains a symlink can fail with # errno 22 (Invalid argument) when the symlink points outside the repo if e.errno == 22: self._path_filters[path] = None else: raise return self._path_filters[path] def find_matching(self, path: str) -> Iterable[Pattern]: """Find matching patterns for path. Args: path: Path to check Returns: Iterator over Pattern instances """ if os.path.isabs(path): raise ValueError(f"{path} is an absolute path") filters = [(0, f) for f in self._global_filters] if os.path.sep != "/": path = path.replace(os.path.sep, "/") parts = path.split("/") matches = [] for i in range(len(parts) + 1): dirname = "/".join(parts[:i]) for s, f in filters: relpath = "/".join(parts[s:i]) if i < len(parts): # Paths leading up to the final part are all directories, # so need a trailing slash. relpath += "/" matches += list(f.find_matching(relpath)) ignore_filter = self._load_path(dirname) if ignore_filter is not None: filters.insert(0, (i, ignore_filter)) return iter(matches) def is_ignored(self, path: str) -> bool | None: """Check whether a path is explicitly included or excluded in ignores. Args: path: Path to check. For directories, the path should end with '/'. Returns: None if the file is not mentioned, True if it is included, False if it is explicitly excluded. """ matches = list(self.find_matching(path)) if not matches: return None # Standard behavior - last matching pattern wins result = matches[-1].is_exclude # Apply Git's parent directory exclusion rule for negations if not result: # Only check if we would include due to negation result = _check_parent_exclusion(path, matches) # Apply special case for issue #1203: directory traversal with ** patterns if result and path.endswith("/"): result = self._apply_directory_traversal_rule(path, matches) return result def _apply_directory_traversal_rule( self, path: str, matches: list["Pattern"] ) -> bool: """Apply directory traversal rule for issue #1203. If a directory would be ignored by a ** pattern, but there are negation patterns for its subdirectories, then the directory itself should not be ignored (to allow traversal). 
""" # Original logic for traversal check last_excluding_pattern = None for match in matches: if match.is_exclude: last_excluding_pattern = match if last_excluding_pattern and ( last_excluding_pattern.pattern.endswith(b"**") or b"**" in last_excluding_pattern.pattern ): # Check if subdirectories would be unignored test_subdir = path + "test/" test_matches = list(self.find_matching(test_subdir)) if test_matches: # Use standard logic for test case - last matching pattern wins test_result = test_matches[-1].is_exclude if test_result is False: return False return True # Keep original result @classmethod def from_repo(cls, repo: "Repo") -> "IgnoreFilterManager": """Create a IgnoreFilterManager from a repository. Args: repo: Repository object Returns: A `IgnoreFilterManager` object """ global_filters = [] for p in [ os.path.join(repo.controldir(), "info", "exclude"), default_user_ignore_filter_path(repo.get_config_stack()), ]: with suppress(OSError): global_filters.append(IgnoreFilter.from_path(os.path.expanduser(p))) config = repo.get_config_stack() ignorecase = config.get_boolean((b"core"), (b"ignorecase"), False) return cls(repo.path, global_filters, ignorecase) dulwich-1.0.0/dulwich/index.py000066400000000000000000003126701513301442600162770ustar00rootroot00000000000000# index.py -- File parser/writer for the git index file # Copyright (C) 2008-2013 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# """Parser for the git index file format.""" __all__ = [ "DEFAULT_VERSION", "EOIE_EXTENSION", "EXTENDED_FLAG_INTEND_TO_ADD", "EXTENDED_FLAG_SKIP_WORKTREE", "FLAG_EXTENDED", "FLAG_NAMEMASK", "FLAG_STAGEMASK", "FLAG_STAGESHIFT", "FLAG_VALID", "HFS_IGNORABLE_CHARS", "IEOT_EXTENSION", "INVALID_DOTNAMES", "REUC_EXTENSION", "SDIR_EXTENSION", "TREE_EXTENSION", "UNTR_EXTENSION", "Index", "IndexEntry", "IndexExtension", "ResolveUndoExtension", "SerializedIndexEntry", "SparseDirExtension", "Stage", "TreeDict", "TreeExtension", "UnmergedEntries", "UnsupportedIndexFormat", "UntrackedExtension", "blob_from_path_and_mode", "blob_from_path_and_stat", "build_file_from_blob", "build_index_from_tree", "changes_from_tree", "cleanup_mode", "commit_index", "commit_tree", "detect_case_only_renames", "get_path_element_normalizer", "get_unstaged_changes", "index_entry_from_stat", "pathjoin", "pathsplit", "read_cache_entry", "read_cache_time", "read_index", "read_index_dict", "read_index_dict_with_version", "read_index_header", "read_submodule_head", "update_working_tree", "validate_path", "validate_path_element_default", "validate_path_element_hfs", "validate_path_element_ntfs", "write_cache_entry", "write_cache_time", "write_index", "write_index_dict", "write_index_extension", ] import errno import os import shutil import stat import struct import sys import types from collections.abc import ( Callable, Generator, Iterable, Iterator, Mapping, Sequence, Set, ) from dataclasses import dataclass from enum import Enum from typing import ( IO, TYPE_CHECKING, Any, BinaryIO, ) if TYPE_CHECKING: from .config import Config from .diff_tree import TreeChange from .file import _GitFile from .filters import FilterBlobNormalizer from .object_store import BaseObjectStore from .repo import Repo from .file import GitFile from .object_store import iter_tree_contents from .objects import ( S_IFGITLINK, S_ISGITLINK, Blob, ObjectID, Tree, TreeEntry, hex_to_sha, sha_to_hex, ) from .pack import ObjectContainer, SHA1Reader, SHA1Writer # Type alias for recursive tree structure used in commit_tree TreeDict = dict[bytes, "TreeDict | tuple[int, ObjectID]"] # 2-bit stage (during merge) FLAG_STAGEMASK = 0x3000 FLAG_STAGESHIFT = 12 FLAG_NAMEMASK = 0x0FFF # assume-valid FLAG_VALID = 0x8000 # extended flag (must be zero in version 2) FLAG_EXTENDED = 0x4000 # used by sparse checkout EXTENDED_FLAG_SKIP_WORKTREE = 0x4000 # used by "git add -N" EXTENDED_FLAG_INTEND_TO_ADD = 0x2000 DEFAULT_VERSION = 2 # Index extension signatures TREE_EXTENSION = b"TREE" REUC_EXTENSION = b"REUC" UNTR_EXTENSION = b"UNTR" EOIE_EXTENSION = b"EOIE" IEOT_EXTENSION = b"IEOT" SDIR_EXTENSION = b"sdir" # Sparse directory extension def _encode_varint(value: int) -> bytes: """Encode an integer using variable-width encoding. Same format as used for OFS_DELTA pack entries and index v4 path compression. Uses 7 bits per byte, with the high bit indicating continuation. Args: value: Integer to encode Returns: Encoded bytes """ if value == 0: return b"\x00" result = [] while value > 0: byte = value & 0x7F # Take lower 7 bits value >>= 7 if value > 0: byte |= 0x80 # Set continuation bit result.append(byte) return bytes(result) def _decode_varint(data: bytes, offset: int = 0) -> tuple[int, int]: """Decode a variable-width encoded integer. 
Args: data: Bytes to decode from offset: Starting offset in data Returns: tuple of (decoded_value, new_offset) """ value = 0 shift = 0 pos = offset while pos < len(data): byte = data[pos] pos += 1 value |= (byte & 0x7F) << shift shift += 7 if not (byte & 0x80): # No continuation bit break return value, pos def _compress_path(path: bytes, previous_path: bytes) -> bytes: """Compress a path relative to the previous path for index version 4. Args: path: Path to compress previous_path: Previous path for comparison Returns: Compressed path data (varint prefix_len + suffix) """ # Find the common prefix length common_len = 0 min_len = min(len(path), len(previous_path)) for i in range(min_len): if path[i] == previous_path[i]: common_len += 1 else: break # The number of bytes to remove from the end of previous_path # to get the common prefix remove_len = len(previous_path) - common_len # The suffix to append suffix = path[common_len:] # Encode: varint(remove_len) + suffix + NUL return _encode_varint(remove_len) + suffix + b"\x00" def _decompress_path( data: bytes, offset: int, previous_path: bytes ) -> tuple[bytes, int]: """Decompress a path from index version 4 compressed format. Args: data: Raw data containing compressed path offset: Starting offset in data previous_path: Previous path for decompression Returns: tuple of (decompressed_path, new_offset) """ # Decode the number of bytes to remove from previous path remove_len, new_offset = _decode_varint(data, offset) # Find the NUL terminator for the suffix suffix_start = new_offset suffix_end = suffix_start while suffix_end < len(data) and data[suffix_end] != 0: suffix_end += 1 if suffix_end >= len(data): raise ValueError("Unterminated path suffix in compressed entry") suffix = data[suffix_start:suffix_end] new_offset = suffix_end + 1 # Skip the NUL terminator # Reconstruct the path if remove_len > len(previous_path): raise ValueError( f"Invalid path compression: trying to remove {remove_len} bytes from {len(previous_path)}-byte path" ) prefix = previous_path[:-remove_len] if remove_len > 0 else previous_path path = prefix + suffix return path, new_offset def _decompress_path_from_stream( f: BinaryIO, previous_path: bytes ) -> tuple[bytes, int]: """Decompress a path from index version 4 compressed format, reading from stream. 
Args: f: File-like object to read from previous_path: Previous path for decompression Returns: tuple of (decompressed_path, bytes_consumed) """ # Decode the varint for remove_len by reading byte by byte remove_len = 0 shift = 0 bytes_consumed = 0 while True: byte_data = f.read(1) if not byte_data: raise ValueError("Unexpected end of file while reading varint") byte = byte_data[0] bytes_consumed += 1 remove_len |= (byte & 0x7F) << shift shift += 7 if not (byte & 0x80): # No continuation bit break # Read the suffix until NUL terminator suffix = b"" while True: byte_data = f.read(1) if not byte_data: raise ValueError("Unexpected end of file while reading path suffix") byte = byte_data[0] bytes_consumed += 1 if byte == 0: # NUL terminator break suffix += bytes([byte]) # Reconstruct the path if remove_len > len(previous_path): raise ValueError( f"Invalid path compression: trying to remove {remove_len} bytes from {len(previous_path)}-byte path" ) prefix = previous_path[:-remove_len] if remove_len > 0 else previous_path path = prefix + suffix return path, bytes_consumed class Stage(Enum): """Represents the stage of an index entry during merge conflicts.""" NORMAL = 0 MERGE_CONFLICT_ANCESTOR = 1 MERGE_CONFLICT_THIS = 2 MERGE_CONFLICT_OTHER = 3 @dataclass class SerializedIndexEntry: """Represents a serialized index entry as stored in the index file. This dataclass holds the raw data for an index entry before it's parsed into the more user-friendly IndexEntry format. """ name: bytes ctime: int | float | tuple[int, int] mtime: int | float | tuple[int, int] dev: int ino: int mode: int uid: int gid: int size: int sha: ObjectID flags: int extended_flags: int def stage(self) -> Stage: """Extract the stage from the flags field. Returns: Stage enum value indicating merge conflict state """ return Stage((self.flags & FLAG_STAGEMASK) >> FLAG_STAGESHIFT) def is_sparse_dir(self) -> bool: """Check if this entry represents a sparse directory. A sparse directory entry is a collapsed representation of an entire directory tree in a sparse index. It has: - Directory mode (0o040000) - SKIP_WORKTREE flag set - Path ending with '/' - SHA pointing to a tree object Returns: True if entry is a sparse directory entry """ return ( stat.S_ISDIR(self.mode) and bool(self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE) and self.name.endswith(b"/") ) @dataclass class IndexExtension: """Base class for index extensions.""" signature: bytes data: bytes @classmethod def from_raw(cls, signature: bytes, data: bytes) -> "IndexExtension": """Create an extension from raw data. Args: signature: 4-byte extension signature data: Extension data Returns: Parsed extension object """ if signature == TREE_EXTENSION: return TreeExtension.from_bytes(data) elif signature == REUC_EXTENSION: return ResolveUndoExtension.from_bytes(data) elif signature == UNTR_EXTENSION: return UntrackedExtension.from_bytes(data) elif signature == SDIR_EXTENSION: return SparseDirExtension.from_bytes(data) else: # Unknown extension - just store raw data return cls(signature, data) def to_bytes(self) -> bytes: """Serialize extension to bytes.""" return self.data class TreeExtension(IndexExtension): """Tree cache extension.""" def __init__(self, entries: list[tuple[bytes, bytes, int]]) -> None: """Initialize TreeExtension. Args: entries: List of tree cache entries (path, sha, flags) """ self.entries = entries super().__init__(TREE_EXTENSION, b"") @classmethod def from_bytes(cls, data: bytes) -> "TreeExtension": """Parse TreeExtension from bytes. 
Args: data: Raw bytes to parse Returns: TreeExtension instance """ # TODO: Implement tree cache parsing return cls([]) def to_bytes(self) -> bytes: """Serialize TreeExtension to bytes. Returns: Serialized extension data """ # TODO: Implement tree cache serialization return b"" class ResolveUndoExtension(IndexExtension): """Resolve undo extension for recording merge conflicts.""" def __init__(self, entries: list[tuple[bytes, list[tuple[int, bytes]]]]) -> None: """Initialize ResolveUndoExtension. Args: entries: List of (path, stages) where stages is a list of (stage, sha) tuples """ self.entries = entries super().__init__(REUC_EXTENSION, b"") @classmethod def from_bytes(cls, data: bytes) -> "ResolveUndoExtension": """Parse ResolveUndoExtension from bytes. Args: data: Raw bytes to parse Returns: ResolveUndoExtension instance """ # TODO: Implement resolve undo parsing return cls([]) def to_bytes(self) -> bytes: """Serialize ResolveUndoExtension to bytes. Returns: Serialized extension data """ # TODO: Implement resolve undo serialization return b"" class UntrackedExtension(IndexExtension): """Untracked cache extension.""" def __init__(self, data: bytes) -> None: """Initialize UntrackedExtension. Args: data: Raw untracked cache data """ super().__init__(UNTR_EXTENSION, data) @classmethod def from_bytes(cls, data: bytes) -> "UntrackedExtension": """Parse UntrackedExtension from bytes. Args: data: Raw bytes to parse Returns: UntrackedExtension instance """ return cls(data) class SparseDirExtension(IndexExtension): """Sparse directory extension. This extension indicates that the index contains sparse directory entries. Tools that don't understand sparse index should avoid interacting with the index when this extension is present. The extension data is empty - its presence is the signal. """ def __init__(self) -> None: """Initialize SparseDirExtension.""" super().__init__(SDIR_EXTENSION, b"") @classmethod def from_bytes(cls, data: bytes) -> "SparseDirExtension": """Parse SparseDirExtension from bytes. Args: data: Raw bytes to parse (should be empty) Returns: SparseDirExtension instance """ return cls() def to_bytes(self) -> bytes: """Serialize SparseDirExtension to bytes. Returns: Empty bytes (extension presence is the signal) """ return b"" @dataclass class IndexEntry: """Represents an entry in the Git index. This is a higher-level representation of an index entry that includes parsed data and convenience methods. """ ctime: int | float | tuple[int, int] mtime: int | float | tuple[int, int] dev: int ino: int mode: int uid: int gid: int size: int sha: ObjectID flags: int = 0 extended_flags: int = 0 @classmethod def from_serialized(cls, serialized: SerializedIndexEntry) -> "IndexEntry": """Create an IndexEntry from a SerializedIndexEntry. Args: serialized: SerializedIndexEntry to convert Returns: New IndexEntry instance """ return cls( ctime=serialized.ctime, mtime=serialized.mtime, dev=serialized.dev, ino=serialized.ino, mode=serialized.mode, uid=serialized.uid, gid=serialized.gid, size=serialized.size, sha=serialized.sha, flags=serialized.flags, extended_flags=serialized.extended_flags, ) def serialize(self, name: bytes, stage: Stage) -> SerializedIndexEntry: """Serialize this entry with a given name and stage. Args: name: Path name for the entry stage: Merge conflict stage Returns: SerializedIndexEntry ready for writing to disk """ # Clear out any existing stage bits, then set them from the Stage. 
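        # Example (illustrative only, with a placeholder all-zero hex SHA):
        #   entry = IndexEntry(ctime=0, mtime=0, dev=0, ino=0, mode=0o100644,
        #                      uid=0, gid=0, size=0, sha=b"0" * 40)
        #   s = entry.serialize(b"docs/readme.txt", Stage.NORMAL)
        #   s.name == b"docs/readme.txt" and s.stage() == Stage.NORMAL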
new_flags = self.flags & ~FLAG_STAGEMASK new_flags |= stage.value << FLAG_STAGESHIFT return SerializedIndexEntry( name=name, ctime=self.ctime, mtime=self.mtime, dev=self.dev, ino=self.ino, mode=self.mode, uid=self.uid, gid=self.gid, size=self.size, sha=self.sha, flags=new_flags, extended_flags=self.extended_flags, ) def stage(self) -> Stage: """Get the merge conflict stage of this entry. Returns: Stage enum value """ return Stage((self.flags & FLAG_STAGEMASK) >> FLAG_STAGESHIFT) @property def skip_worktree(self) -> bool: """Return True if the skip-worktree bit is set in extended_flags.""" return bool(self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE) def set_skip_worktree(self, skip: bool = True) -> None: """Helper method to set or clear the skip-worktree bit in extended_flags. Also sets FLAG_EXTENDED in self.flags if needed. """ if skip: # Turn on the skip-worktree bit self.extended_flags |= EXTENDED_FLAG_SKIP_WORKTREE # Also ensure the main 'extended' bit is set in flags self.flags |= FLAG_EXTENDED else: # Turn off the skip-worktree bit self.extended_flags &= ~EXTENDED_FLAG_SKIP_WORKTREE # Optionally unset the main extended bit if no extended flags remain if self.extended_flags == 0: self.flags &= ~FLAG_EXTENDED def is_sparse_dir(self, name: bytes) -> bool: """Check if this entry represents a sparse directory. A sparse directory entry is a collapsed representation of an entire directory tree in a sparse index. It has: - Directory mode (0o040000) - SKIP_WORKTREE flag set - Path ending with '/' - SHA pointing to a tree object Args: name: The path name for this entry (IndexEntry doesn't store name) Returns: True if entry is a sparse directory entry """ return ( stat.S_ISDIR(self.mode) and bool(self.extended_flags & EXTENDED_FLAG_SKIP_WORKTREE) and name.endswith(b"/") ) class ConflictedIndexEntry: """Index entry that represents a conflict.""" ancestor: IndexEntry | None this: IndexEntry | None other: IndexEntry | None def __init__( self, ancestor: IndexEntry | None = None, this: IndexEntry | None = None, other: IndexEntry | None = None, ) -> None: """Initialize ConflictedIndexEntry. Args: ancestor: The common ancestor entry this: The current branch entry other: The other branch entry """ self.ancestor = ancestor self.this = this self.other = other class UnmergedEntries(Exception): """Unmerged entries exist in the index.""" def pathsplit(path: bytes) -> tuple[bytes, bytes]: """Split a /-delimited path into a directory part and a basename. Args: path: The path to split. Returns: Tuple with directory name and basename """ try: (dirname, basename) = path.rsplit(b"/", 1) except ValueError: return (b"", path) else: return (dirname, basename) def pathjoin(*args: bytes) -> bytes: """Join a /-delimited path.""" return b"/".join([p for p in args if p]) def read_cache_time(f: BinaryIO) -> tuple[int, int]: """Read a cache time. Args: f: File-like object to read from Returns: Tuple with seconds and nanoseconds """ return struct.unpack(">LL", f.read(8)) def write_cache_time(f: IO[bytes], t: int | float | tuple[int, int]) -> None: """Write a cache time. Args: f: File-like object to write to t: Time to write (as int, float or tuple with secs and nsecs) """ if isinstance(t, int): t = (t, 0) elif isinstance(t, float): (secs, nsecs) = divmod(t, 1.0) t = (int(secs), int(nsecs * 1000000000)) elif not isinstance(t, tuple): raise TypeError(t) f.write(struct.pack(">LL", *t)) def read_cache_entry( f: BinaryIO, version: int, previous_path: bytes = b"" ) -> SerializedIndexEntry: """Read an entry from a cache file. 
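    Example (round-trip of the cache-time helpers used for each entry):

        import io

        buf = io.BytesIO()
        write_cache_time(buf, 1700000000.5)
        buf.seek(0)
        read_cache_time(buf)  # (1700000000, 500000000): seconds, nanoseconds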
Args: f: File-like object to read from version: Index version previous_path: Previous entry's path (for version 4 compression) """ beginoffset = f.tell() ctime = read_cache_time(f) mtime = read_cache_time(f) ( dev, ino, mode, uid, gid, size, sha, flags, ) = struct.unpack(">LLLLLL20sH", f.read(20 + 4 * 6 + 2)) if flags & FLAG_EXTENDED: if version < 3: raise AssertionError("extended flag set in index with version < 3") (extended_flags,) = struct.unpack(">H", f.read(2)) else: extended_flags = 0 if version >= 4: # Version 4: paths are always compressed (name_len should be 0) name, _consumed = _decompress_path_from_stream(f, previous_path) else: # Versions < 4: regular name reading name = f.read(flags & FLAG_NAMEMASK) # Padding: if version < 4: real_size = (f.tell() - beginoffset + 8) & ~7 f.read((beginoffset + real_size) - f.tell()) return SerializedIndexEntry( name, ctime, mtime, dev, ino, mode, uid, gid, size, sha_to_hex(sha), flags & ~FLAG_NAMEMASK, extended_flags, ) def write_cache_entry( f: IO[bytes], entry: SerializedIndexEntry, version: int, previous_path: bytes = b"" ) -> None: """Write an index entry to a file. Args: f: File object entry: IndexEntry to write version: Index format version previous_path: Previous entry's path (for version 4 compression) """ beginoffset = f.tell() write_cache_time(f, entry.ctime) write_cache_time(f, entry.mtime) if version >= 4: # Version 4: use compression but set name_len to actual filename length # This matches how C Git implements index v4 flags compressed_path = _compress_path(entry.name, previous_path) flags = len(entry.name) | (entry.flags & ~FLAG_NAMEMASK) else: # Versions < 4: include actual name length flags = len(entry.name) | (entry.flags & ~FLAG_NAMEMASK) if entry.extended_flags: flags |= FLAG_EXTENDED if flags & FLAG_EXTENDED and version is not None and version < 3: raise AssertionError("unable to use extended flags in version < 3") f.write( struct.pack( b">LLLLLL20sH", entry.dev & 0xFFFFFFFF, entry.ino & 0xFFFFFFFF, entry.mode, entry.uid, entry.gid, entry.size, hex_to_sha(entry.sha), flags, ) ) if flags & FLAG_EXTENDED: f.write(struct.pack(b">H", entry.extended_flags)) if version >= 4: # Version 4: always write compressed path f.write(compressed_path) else: # Versions < 4: write regular path and padding f.write(entry.name) real_size = (f.tell() - beginoffset + 8) & ~7 f.write(b"\0" * ((beginoffset + real_size) - f.tell())) class UnsupportedIndexFormat(Exception): """An unsupported index format was encountered.""" def __init__(self, version: int) -> None: """Initialize UnsupportedIndexFormat exception. Args: version: The unsupported index format version """ self.index_format_version = version def read_index_header(f: BinaryIO) -> tuple[int, int]: """Read an index header from a file. Returns: tuple of (version, num_entries) """ header = f.read(4) if header != b"DIRC": raise AssertionError(f"Invalid index file header: {header!r}") (version, num_entries) = struct.unpack(b">LL", f.read(4 * 2)) if version not in (1, 2, 3, 4): raise UnsupportedIndexFormat(version) return version, num_entries def write_index_extension(f: IO[bytes], extension: IndexExtension) -> None: """Write an index extension. 
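    Example (writing a raw extension with a made-up ``XYZW`` signature to an
    in-memory file):

        import io

        buf = io.BytesIO()
        write_index_extension(buf, IndexExtension(b"XYZW", b"data"))
        buf.getvalue()  # b'XYZW\x00\x00\x00\x04data': signature, size, payload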
Args: f: File-like object to write to extension: Extension to write """ data = extension.to_bytes() f.write(extension.signature) f.write(struct.pack(">I", len(data))) f.write(data) def read_index(f: BinaryIO) -> Iterator[SerializedIndexEntry]: """Read an index file, yielding the individual entries.""" version, num_entries = read_index_header(f) previous_path = b"" for i in range(num_entries): entry = read_cache_entry(f, version, previous_path) previous_path = entry.name yield entry def read_index_dict_with_version( f: BinaryIO, ) -> tuple[dict[bytes, IndexEntry | ConflictedIndexEntry], int, list[IndexExtension]]: """Read an index file and return it as a dictionary along with the version. Returns: tuple of (entries_dict, version, extensions) """ version, num_entries = read_index_header(f) ret: dict[bytes, IndexEntry | ConflictedIndexEntry] = {} previous_path = b"" for i in range(num_entries): entry = read_cache_entry(f, version, previous_path) previous_path = entry.name stage = entry.stage() if stage == Stage.NORMAL: ret[entry.name] = IndexEntry.from_serialized(entry) else: existing = ret.setdefault(entry.name, ConflictedIndexEntry()) if isinstance(existing, IndexEntry): raise AssertionError(f"Non-conflicted entry for {entry.name!r} exists") if stage == Stage.MERGE_CONFLICT_ANCESTOR: existing.ancestor = IndexEntry.from_serialized(entry) elif stage == Stage.MERGE_CONFLICT_THIS: existing.this = IndexEntry.from_serialized(entry) elif stage == Stage.MERGE_CONFLICT_OTHER: existing.other = IndexEntry.from_serialized(entry) # Read extensions extensions = [] while True: # Check if we're at the end (20 bytes before EOF for SHA checksum) current_pos = f.tell() f.seek(0, 2) # EOF eof_pos = f.tell() f.seek(current_pos) if current_pos >= eof_pos - 20: break # Try to read extension signature signature = f.read(4) if len(signature) < 4: break # Check if it's a valid extension signature (4 uppercase letters) if not all(65 <= b <= 90 for b in signature): # Not an extension, seek back f.seek(-4, 1) break # Read extension size size_data = f.read(4) if len(size_data) < 4: break size = struct.unpack(">I", size_data)[0] # Read extension data data = f.read(size) if len(data) < size: break extension = IndexExtension.from_raw(signature, data) extensions.append(extension) return ret, version, extensions def read_index_dict( f: BinaryIO, ) -> dict[bytes, IndexEntry | ConflictedIndexEntry]: """Read an index file and return it as a dictionary. Dict Key is tuple of path and stage number, as path alone is not unique Args: f: File object to read fromls. """ ret: dict[bytes, IndexEntry | ConflictedIndexEntry] = {} for entry in read_index(f): stage = entry.stage() if stage == Stage.NORMAL: ret[entry.name] = IndexEntry.from_serialized(entry) else: existing = ret.setdefault(entry.name, ConflictedIndexEntry()) if isinstance(existing, IndexEntry): raise AssertionError(f"Non-conflicted entry for {entry.name!r} exists") if stage == Stage.MERGE_CONFLICT_ANCESTOR: existing.ancestor = IndexEntry.from_serialized(entry) elif stage == Stage.MERGE_CONFLICT_THIS: existing.this = IndexEntry.from_serialized(entry) elif stage == Stage.MERGE_CONFLICT_OTHER: existing.other = IndexEntry.from_serialized(entry) return ret def write_index( f: IO[bytes], entries: Sequence[SerializedIndexEntry], version: int | None = None, extensions: Sequence[IndexExtension] | None = None, ) -> None: """Write an index file. 
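    Example (a minimal in-memory round trip; the SHA is a placeholder all-zero
    hex object id):

        import io

        e = SerializedIndexEntry(
            name=b"a.txt", ctime=(0, 0), mtime=(0, 0), dev=0, ino=0,
            mode=0o100644, uid=0, gid=0, size=0, sha=b"0" * 40,
            flags=0, extended_flags=0,
        )
        buf = io.BytesIO()
        write_index(buf, [e], version=2)
        buf.seek(0)
        [entry.name for entry in read_index(buf)]  # [b'a.txt']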
Args: f: File-like object to write to version: Version number to write entries: Iterable over the entries to write extensions: Optional list of extensions to write """ if version is None: version = DEFAULT_VERSION # STEP 1: check if any extended_flags are set uses_extended_flags = any(e.extended_flags != 0 for e in entries) if uses_extended_flags and version < 3: # Force or bump the version to 3 version = 3 # The rest is unchanged, but you might insert a final check: if version < 3: # Double-check no extended flags appear for e in entries: if e.extended_flags != 0: raise AssertionError("Attempt to use extended flags in index < v3") # Proceed with the existing code to write the header and entries. f.write(b"DIRC") f.write(struct.pack(b">LL", version, len(entries))) previous_path = b"" for entry in entries: write_cache_entry(f, entry, version=version, previous_path=previous_path) previous_path = entry.name # Write extensions if extensions: for extension in extensions: write_index_extension(f, extension) def write_index_dict( f: IO[bytes], entries: Mapping[bytes, IndexEntry | ConflictedIndexEntry], version: int | None = None, extensions: Sequence[IndexExtension] | None = None, ) -> None: """Write an index file based on the contents of a dictionary. being careful to sort by path and then by stage. """ entries_list = [] for key in sorted(entries): value = entries[key] if isinstance(value, ConflictedIndexEntry): if value.ancestor is not None: entries_list.append( value.ancestor.serialize(key, Stage.MERGE_CONFLICT_ANCESTOR) ) if value.this is not None: entries_list.append( value.this.serialize(key, Stage.MERGE_CONFLICT_THIS) ) if value.other is not None: entries_list.append( value.other.serialize(key, Stage.MERGE_CONFLICT_OTHER) ) else: entries_list.append(value.serialize(key, Stage.NORMAL)) write_index(f, entries_list, version=version, extensions=extensions) def cleanup_mode(mode: int) -> int: """Cleanup a mode value. This will return a mode that can be stored in a tree object. Args: mode: Mode to clean up. Returns: mode """ if stat.S_ISLNK(mode): return stat.S_IFLNK elif stat.S_ISDIR(mode): return stat.S_IFDIR elif S_ISGITLINK(mode): return S_IFGITLINK ret = stat.S_IFREG | 0o644 if mode & 0o100: ret |= 0o111 return ret class Index: """A Git Index file.""" _byname: dict[bytes, IndexEntry | ConflictedIndexEntry] def __init__( self, filename: bytes | str | os.PathLike[str], read: bool = True, skip_hash: bool = False, version: int | None = None, *, file_mode: int | None = None, ) -> None: """Create an index object associated with the given filename. Args: filename: Path to the index file read: Whether to initialize the index from the given file, should it exist. skip_hash: Whether to skip SHA1 hash when writing (for manyfiles feature) version: Index format version to use (None = auto-detect from file or use default) file_mode: Optional file permission mask for shared repository """ self._filename = os.fspath(filename) # TODO(jelmer): Store the version returned by read_index self._version = version self._skip_hash = skip_hash self._file_mode = file_mode self._extensions: list[IndexExtension] = [] self.clear() if read: self.read() @property def path(self) -> bytes | str: """Get the path to the index file. 
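# Small worked example (illustrative only): cleanup_mode() above collapses
# arbitrary on-disk modes to the few values Git stores in tree objects:
# 0o100644, 0o100755, 0o120000 (symlink), 0o40000 (directory) or the
# gitlink mode for submodules.
def _example_cleanup_mode() -> None:
    """Show how filesystem modes map to tree modes."""
    assert cleanup_mode(0o100664) == 0o100644  # group-writable file -> 644
    assert cleanup_mode(0o100775) == 0o100755  # any user-executable file -> 755
    assert cleanup_mode(0o120777) == 0o120000  # symlink; permission bits dropped
    assert cleanup_mode(0o040755) == 0o040000  # directory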
Returns: Path to the index file """ return self._filename def __repr__(self) -> str: """Return string representation of Index.""" return f"{self.__class__.__name__}({self._filename!r})" def write(self) -> None: """Write current contents of index to disk.""" mask = self._file_mode if self._file_mode is not None else 0o644 f = GitFile(self._filename, "wb", mask=mask) try: # Filter out extensions with no meaningful data meaningful_extensions = [] for ext in self._extensions: # Skip extensions that have empty data ext_data = ext.to_bytes() if ext_data: meaningful_extensions.append(ext) if self._skip_hash: # When skipHash is enabled, write the index without computing SHA1 write_index_dict( f, self._byname, version=self._version, extensions=meaningful_extensions, ) # Write 20 zero bytes instead of SHA1 f.write(b"\x00" * 20) f.close() else: sha1_writer = SHA1Writer(f) write_index_dict( sha1_writer, self._byname, version=self._version, extensions=meaningful_extensions, ) sha1_writer.close() except: f.close() raise def read(self) -> None: """Read current contents of index from disk.""" if not os.path.exists(self._filename): return f = GitFile(self._filename, "rb") try: sha1_reader = SHA1Reader(f) entries, version, extensions = read_index_dict_with_version(sha1_reader) self._version = version self._extensions = extensions self.update(entries) # Extensions have already been read by read_index_dict_with_version sha1_reader.check_sha(allow_empty=True) finally: f.close() def __len__(self) -> int: """Number of entries in this index file.""" return len(self._byname) def __getitem__(self, key: bytes) -> IndexEntry | ConflictedIndexEntry: """Retrieve entry by relative path and stage. Returns: Either a IndexEntry or a ConflictedIndexEntry Raises KeyError: if the entry does not exist """ return self._byname[key] def __iter__(self) -> Iterator[bytes]: """Iterate over the paths and stages in this index.""" return iter(self._byname) def __contains__(self, key: bytes) -> bool: """Check if a path exists in the index.""" return key in self._byname def get_sha1(self, path: bytes) -> ObjectID: """Return the (git object) SHA1 for the object at a path.""" value = self[path] if isinstance(value, ConflictedIndexEntry): raise UnmergedEntries return value.sha def get_mode(self, path: bytes) -> int: """Return the POSIX file mode for the object at a path.""" value = self[path] if isinstance(value, ConflictedIndexEntry): raise UnmergedEntries return value.mode def iterobjects(self) -> Iterable[tuple[bytes, ObjectID, int]]: """Iterate over path, sha, mode tuples for use with commit_tree.""" for path in self: entry = self[path] if isinstance(entry, ConflictedIndexEntry): raise UnmergedEntries yield path, entry.sha, cleanup_mode(entry.mode) def has_conflicts(self) -> bool: """Check if the index contains any conflicted entries. Returns: True if any entries are conflicted, False otherwise """ for value in self._byname.values(): if isinstance(value, ConflictedIndexEntry): return True return False def clear(self) -> None: """Remove all contents from this index.""" self._byname = {} def __setitem__( self, name: bytes, value: IndexEntry | ConflictedIndexEntry ) -> None: """Set an entry in the index.""" assert isinstance(name, bytes) self._byname[name] = value def __delitem__(self, name: bytes) -> None: """Delete an entry from the index.""" del self._byname[name] def iteritems( self, ) -> Iterator[tuple[bytes, IndexEntry | ConflictedIndexEntry]]: """Iterate over (path, entry) pairs in the index. 
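# Illustrative sketch (not library code) of the high-level Index API defined
# above. The ".git/index" default is an assumption; any index file works.
def _example_index_usage(path: str = ".git/index") -> None:
    """Open an index and print its entries, flagging merge conflicts."""
    index = Index(path)
    print(f"{len(index)} staged paths")
    for name, entry in index.items():
        if isinstance(entry, ConflictedIndexEntry):
            print(f"conflict: {name!r}")
        else:
            print(name.decode("utf-8", "replace"), entry.sha.decode())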
Returns: Iterator of (path, entry) tuples """ return iter(self._byname.items()) def items(self) -> Iterator[tuple[bytes, IndexEntry | ConflictedIndexEntry]]: """Get an iterator over (path, entry) pairs. Returns: Iterator of (path, entry) tuples """ return iter(self._byname.items()) def update(self, entries: dict[bytes, IndexEntry | ConflictedIndexEntry]) -> None: """Update the index with multiple entries. Args: entries: Dictionary mapping paths to index entries """ for key, value in entries.items(): self[key] = value def paths(self) -> Generator[bytes, None, None]: """Generate all paths in the index. Yields: Path names as bytes """ yield from self._byname.keys() def changes_from_tree( self, object_store: ObjectContainer, tree: ObjectID, want_unchanged: bool = False, ) -> Generator[ tuple[ tuple[bytes | None, bytes | None], tuple[int | None, int | None], tuple[bytes | None, bytes | None], ], None, None, ]: """Find the differences between the contents of this index and a tree. Args: object_store: Object store to use for retrieving tree contents tree: SHA1 of the root tree want_unchanged: Whether unchanged files should be reported Returns: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) """ def lookup_entry(path: bytes) -> tuple[bytes, int]: entry = self[path] if hasattr(entry, "sha") and hasattr(entry, "mode"): return entry.sha, cleanup_mode(entry.mode) else: # Handle ConflictedIndexEntry case return b"", 0 yield from changes_from_tree( self.paths(), lookup_entry, object_store, tree, want_unchanged=want_unchanged, ) def commit(self, object_store: ObjectContainer) -> ObjectID: """Create a new tree from an index. Args: object_store: Object store to save the tree in Returns: Root tree SHA """ return commit_tree(object_store, self.iterobjects()) def is_sparse(self) -> bool: """Check if this index contains sparse directory entries. Returns: True if any sparse directory extension is present """ return any(isinstance(ext, SparseDirExtension) for ext in self._extensions) def ensure_full_index(self, object_store: "BaseObjectStore") -> None: """Expand all sparse directory entries into full file entries. This converts a sparse index into a full index by recursively expanding any sparse directory entries into their constituent files. Args: object_store: Object store to read tree objects from Raises: KeyError: If a tree object referenced by a sparse dir entry doesn't exist """ if not self.is_sparse(): return # Find all sparse directory entries sparse_dirs = [] for path, entry in list(self._byname.items()): if isinstance(entry, IndexEntry) and entry.is_sparse_dir(path): sparse_dirs.append((path, entry)) # Expand each sparse directory for path, entry in sparse_dirs: # Remove the sparse directory entry del self._byname[path] # Get the tree object tree = object_store[entry.sha] if not isinstance(tree, Tree): raise ValueError(f"Sparse directory {path!r} points to non-tree object") # Recursively add all entries from the tree self._expand_tree(path.rstrip(b"/"), tree, object_store, entry) # Remove the sparse directory extension self._extensions = [ ext for ext in self._extensions if not isinstance(ext, SparseDirExtension) ] def _expand_tree( self, prefix: bytes, tree: Tree, object_store: "BaseObjectStore", template_entry: IndexEntry, ) -> None: """Recursively expand a tree into index entries. 
Args: prefix: Path prefix for entries (without trailing slash) tree: Tree object to expand object_store: Object store to read nested trees from template_entry: Template entry to copy metadata from """ for name, mode, sha in tree.items(): if prefix: full_path = prefix + b"/" + name else: full_path = name if stat.S_ISDIR(mode): # Recursively expand subdirectories subtree = object_store[sha] if not isinstance(subtree, Tree): raise ValueError( f"Directory entry {full_path!r} points to non-tree object" ) self._expand_tree(full_path, subtree, object_store, template_entry) else: # Create an index entry for this file # Use the template entry for metadata but with the file's sha and mode new_entry = IndexEntry( ctime=template_entry.ctime, mtime=template_entry.mtime, dev=template_entry.dev, ino=template_entry.ino, mode=mode, uid=template_entry.uid, gid=template_entry.gid, size=0, # Size is unknown from tree sha=sha, flags=0, extended_flags=0, # Don't copy skip-worktree flag ) self._byname[full_path] = new_entry def convert_to_sparse( self, object_store: "BaseObjectStore", tree_sha: ObjectID, sparse_dirs: Set[bytes], ) -> None: """Convert full index entries to sparse directory entries. This collapses directories that are entirely outside the sparse checkout cone into single sparse directory entries. Args: object_store: Object store to read tree objects tree_sha: SHA of the tree (usually HEAD) to base sparse dirs on sparse_dirs: Set of directory paths (with trailing /) to collapse Raises: KeyError: If tree_sha or a subdirectory doesn't exist """ if not sparse_dirs: return # Get the base tree tree = object_store[tree_sha] if not isinstance(tree, Tree): raise ValueError(f"tree_sha {tree_sha!r} is not a tree object") # For each sparse directory, find its tree SHA and create sparse entry for dir_path in sparse_dirs: dir_path_stripped = dir_path.rstrip(b"/") # Find the tree SHA for this directory subtree_sha = self._find_subtree_sha(tree, dir_path_stripped, object_store) if subtree_sha is None: # Directory doesn't exist in tree, skip it continue # Remove all entries under this directory entries_to_remove = [ path for path in self._byname if path.startswith(dir_path) or path == dir_path_stripped ] for path in entries_to_remove: del self._byname[path] # Create a sparse directory entry # Use minimal metadata since it's not a real file from dulwich.objects import ObjectID sparse_entry = IndexEntry( ctime=0, mtime=0, dev=0, ino=0, mode=stat.S_IFDIR, uid=0, gid=0, size=0, sha=ObjectID(subtree_sha), flags=0, extended_flags=EXTENDED_FLAG_SKIP_WORKTREE, ) self._byname[dir_path] = sparse_entry # Add sparse directory extension if not present if not self.is_sparse(): self._extensions.append(SparseDirExtension()) def _find_subtree_sha( self, tree: Tree, path: bytes, object_store: "BaseObjectStore", ) -> bytes | None: """Find the SHA of a subtree at a given path. 
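# Hedged sketch of the sparse-index round trip provided by convert_to_sparse()
# and ensure_full_index() above; "repo" and the b"vendor/" directory are
# assumptions for illustration, and HEAD is assumed to point at a commit.
def _example_sparse_roundtrip(repo: "Repo") -> None:
    """Collapse one directory to a sparse entry, then expand it again."""
    index = repo.open_index()
    head_tree = repo.object_store[repo.head()].tree
    index.convert_to_sparse(repo.object_store, head_tree, {b"vendor/"})
    assert index.is_sparse()
    index.ensure_full_index(repo.object_store)
    assert not index.is_sparse()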
Args: tree: Root tree object to search in path: Path to the subdirectory (no trailing slash) object_store: Object store to read nested trees from Returns: SHA of the subtree, or None if path doesn't exist """ if not path: return tree.id parts = path.split(b"/") current_tree = tree for part in parts: # Look for this part in the current tree try: mode, sha = current_tree[part] except KeyError: return None if not stat.S_ISDIR(mode): # Path component is a file, not a directory return None # Load the next tree obj = object_store[sha] if not isinstance(obj, Tree): return None current_tree = obj return current_tree.id def commit_tree( object_store: ObjectContainer, blobs: Iterable[tuple[bytes, ObjectID, int]] ) -> ObjectID: """Commit a new tree. Args: object_store: Object store to add trees to blobs: Iterable over blob path, sha, mode entries Returns: SHA1 of the created tree. """ trees: dict[bytes, TreeDict] = {b"": {}} def add_tree(path: bytes) -> TreeDict: if path in trees: return trees[path] dirname, basename = pathsplit(path) t = add_tree(dirname) assert isinstance(basename, bytes) newtree: TreeDict = {} t[basename] = newtree trees[path] = newtree return newtree for path, sha, mode in blobs: tree_path, basename = pathsplit(path) tree = add_tree(tree_path) tree[basename] = (mode, sha) def build_tree(path: bytes) -> ObjectID: tree = Tree() for basename, entry in trees[path].items(): if isinstance(entry, dict): mode = stat.S_IFDIR sha = build_tree(pathjoin(path, basename)) else: (mode, sha) = entry tree.add(basename, mode, sha) object_store.add_object(tree) return tree.id return build_tree(b"") def commit_index(object_store: ObjectContainer, index: Index) -> ObjectID: """Create a new tree from an index. Args: object_store: Object store to save the tree in index: Index file Note: This function is deprecated, use index.commit() instead. Returns: Root tree sha. """ return commit_tree(object_store, index.iterobjects()) def changes_from_tree( names: Iterable[bytes], lookup_entry: Callable[[bytes], tuple[bytes, int]], object_store: ObjectContainer, tree: ObjectID | None, want_unchanged: bool = False, ) -> Iterable[ tuple[ tuple[bytes | None, bytes | None], tuple[int | None, int | None], tuple[bytes | None, bytes | None], ] ]: """Find the differences between the contents of a tree and a working copy. 
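# Illustrative example of commit_tree() above, using an in-memory object
# store so it has no side effects on disk. The file name and contents are
# arbitrary.
def _example_commit_tree() -> bytes:
    """Build a two-level tree from a single blob and return its SHA."""
    from dulwich.object_store import MemoryObjectStore
    from dulwich.objects import Blob

    store = MemoryObjectStore()
    blob = Blob.from_string(b"hello world\n")
    store.add_object(blob)
    # commit_tree creates the intermediate b"docs" tree automatically.
    return commit_tree(store, [(b"docs/readme.txt", blob.id, 0o100644)])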
Args: names: Iterable of names in the working copy lookup_entry: Function to lookup an entry in the working copy object_store: Object store to use for retrieving tree contents tree: SHA1 of the root tree, or None for an empty tree want_unchanged: Whether unchanged files should be reported Returns: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) """ # TODO(jelmer): Support a include_trees option other_names = set(names) if tree is not None: for name, mode, sha in iter_tree_contents(object_store, tree): assert name is not None and mode is not None and sha is not None try: (other_sha, other_mode) = lookup_entry(name) except KeyError: # Was removed yield ((name, None), (mode, None), (sha, None)) else: other_names.remove(name) if want_unchanged or other_sha != sha or other_mode != mode: yield ((name, name), (mode, other_mode), (sha, other_sha)) # Mention added files for name in other_names: try: (other_sha, other_mode) = lookup_entry(name) except KeyError: pass else: yield ((None, name), (None, other_mode), (None, other_sha)) def index_entry_from_stat( stat_val: os.stat_result, hex_sha: bytes, mode: int | None = None, ) -> IndexEntry: """Create a new index entry from a stat value. Args: stat_val: POSIX stat_result instance hex_sha: Hex sha of the object mode: Optional file mode, will be derived from stat if not provided """ if mode is None: mode = cleanup_mode(stat_val.st_mode) from dulwich.objects import ObjectID # Use nanosecond precision when available to avoid precision loss # through float representation ctime: int | float | tuple[int, int] mtime: int | float | tuple[int, int] st_ctime_ns = getattr(stat_val, "st_ctime_ns", None) if st_ctime_ns is not None: ctime = ( st_ctime_ns // 1_000_000_000, st_ctime_ns % 1_000_000_000, ) else: ctime = stat_val.st_ctime st_mtime_ns = getattr(stat_val, "st_mtime_ns", None) if st_mtime_ns is not None: mtime = ( st_mtime_ns // 1_000_000_000, st_mtime_ns % 1_000_000_000, ) else: mtime = stat_val.st_mtime return IndexEntry( ctime=ctime, mtime=mtime, dev=stat_val.st_dev, ino=stat_val.st_ino, mode=mode, uid=stat_val.st_uid, gid=stat_val.st_gid, size=stat_val.st_size, sha=ObjectID(hex_sha), flags=0, extended_flags=0, ) if sys.platform == "win32": # On Windows, creating symlinks either requires administrator privileges # or developer mode. Raise a more helpful error when we're unable to # create symlinks # https://github.com/jelmer/dulwich/issues/1005 class WindowsSymlinkPermissionError(PermissionError): """Windows-specific error for symlink creation failures. This error is raised when symlink creation fails on Windows, typically due to lack of developer mode or administrator privileges. """ def __init__(self, errno: int, msg: str, filename: str | None) -> None: """Initialize WindowsSymlinkPermissionError.""" super().__init__( errno, f"Unable to create symlink; do you have developer mode enabled? {msg}", filename, ) def symlink( src: str | bytes, dst: str | bytes, target_is_directory: bool = False, *, dir_fd: int | None = None, ) -> None: """Create a symbolic link on Windows with better error handling. 
Args: src: Source path for the symlink dst: Destination path where symlink will be created target_is_directory: Whether the target is a directory dir_fd: Optional directory file descriptor Raises: WindowsSymlinkPermissionError: If symlink creation fails due to permissions """ try: return os.symlink( src, dst, target_is_directory=target_is_directory, dir_fd=dir_fd ) except PermissionError as e: raise WindowsSymlinkPermissionError( e.errno or 0, e.strerror or "", e.filename ) from e else: symlink = os.symlink def build_file_from_blob( blob: Blob, mode: int, target_path: bytes, *, honor_filemode: bool = True, tree_encoding: str = "utf-8", symlink_fn: Callable[ [str | bytes | os.PathLike[str], str | bytes | os.PathLike[str]], None ] | None = None, ) -> os.stat_result: """Build a file or symlink on disk based on a Git object. Args: blob: The git object mode: File mode target_path: Path to write to honor_filemode: An optional flag to honor core.filemode setting in config file, default is core.filemode=True, change executable bit tree_encoding: Encoding to use for tree contents symlink_fn: Function to use for creating symlinks Returns: stat object for the file """ try: oldstat = os.lstat(target_path) except FileNotFoundError: oldstat = None contents = blob.as_raw_string() if stat.S_ISLNK(mode): if oldstat: _remove_file_with_readonly_handling(target_path) if sys.platform == "win32": # os.readlink on Python3 on Windows requires a unicode string. contents_str = contents.decode(tree_encoding) target_path_str = target_path.decode(tree_encoding) (symlink_fn or symlink)(contents_str, target_path_str) else: (symlink_fn or symlink)(contents, target_path) else: if oldstat is not None and oldstat.st_size == len(contents): with open(target_path, "rb") as f: if f.read() == contents: return oldstat with open(target_path, "wb") as f: # Write out file f.write(contents) if honor_filemode: os.chmod(target_path, mode) return os.lstat(target_path) INVALID_DOTNAMES = (b".git", b".", b"..", b"") def _normalize_path_element_default(element: bytes) -> bytes: """Normalize path element for default case-insensitive comparison.""" return element.lower() def _normalize_path_element_ntfs(element: bytes) -> bytes: """Normalize path element for NTFS filesystem.""" return element.rstrip(b". ").lower() def _normalize_path_element_hfs(element: bytes) -> bytes: """Normalize path element for HFS+ filesystem.""" import unicodedata # Decode to Unicode (let UnicodeDecodeError bubble up) element_str = element.decode("utf-8", errors="strict") # Remove HFS+ ignorable characters filtered = "".join(c for c in element_str if ord(c) not in HFS_IGNORABLE_CHARS) # Normalize to NFD normalized = unicodedata.normalize("NFD", filtered) return normalized.lower().encode("utf-8", errors="strict") def get_path_element_normalizer(config: "Config") -> Callable[[bytes], bytes]: """Get the appropriate path element normalization function based on config. Args: config: Repository configuration object Returns: Function that normalizes path elements for the configured filesystem """ import os import sys if config.get_boolean(b"core", b"protectNTFS", os.name == "nt"): return _normalize_path_element_ntfs elif config.get_boolean(b"core", b"protectHFS", sys.platform == "darwin"): return _normalize_path_element_hfs else: return _normalize_path_element_default def validate_path_element_default(element: bytes) -> bool: """Validate a path element using default rules. 
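# Hedged sketch of build_file_from_blob() above; the temporary directory and
# file name are assumptions so the example leaves nothing behind.
def _example_build_file_from_blob() -> None:
    """Materialize a blob as an executable regular file on disk."""
    import tempfile

    from dulwich.objects import Blob

    blob = Blob.from_string(b"#!/bin/sh\necho hi\n")
    with tempfile.TemporaryDirectory() as tmp:
        target = os.path.join(os.fsencode(tmp), b"hook.sh")
        st = build_file_from_blob(blob, 0o100755, target)
        assert stat.S_ISREG(st.st_mode)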
Args: element: Path element to validate Returns: True if path element is valid, False otherwise """ return _normalize_path_element_default(element) not in INVALID_DOTNAMES def validate_path_element_ntfs(element: bytes) -> bool: """Validate a path element using NTFS filesystem rules. Args: element: Path element to validate Returns: True if path element is valid for NTFS, False otherwise """ normalized = _normalize_path_element_ntfs(element) if normalized in INVALID_DOTNAMES: return False if normalized == b"git~1": return False return True # HFS+ ignorable Unicode codepoints (from Git's utf8.c) HFS_IGNORABLE_CHARS = { 0x200C, # ZERO WIDTH NON-JOINER 0x200D, # ZERO WIDTH JOINER 0x200E, # LEFT-TO-RIGHT MARK 0x200F, # RIGHT-TO-LEFT MARK 0x202A, # LEFT-TO-RIGHT EMBEDDING 0x202B, # RIGHT-TO-LEFT EMBEDDING 0x202C, # POP DIRECTIONAL FORMATTING 0x202D, # LEFT-TO-RIGHT OVERRIDE 0x202E, # RIGHT-TO-LEFT OVERRIDE 0x206A, # INHIBIT SYMMETRIC SWAPPING 0x206B, # ACTIVATE SYMMETRIC SWAPPING 0x206C, # INHIBIT ARABIC FORM SHAPING 0x206D, # ACTIVATE ARABIC FORM SHAPING 0x206E, # NATIONAL DIGIT SHAPES 0x206F, # NOMINAL DIGIT SHAPES 0xFEFF, # ZERO WIDTH NO-BREAK SPACE } def validate_path_element_hfs(element: bytes) -> bool: """Validate path element for HFS+ filesystem. Equivalent to Git's is_hfs_dotgit and related checks. Uses NFD normalization and ignores HFS+ ignorable characters. """ try: normalized = _normalize_path_element_hfs(element) except UnicodeDecodeError: # Malformed UTF-8 - be conservative and reject return False # Check against invalid names if normalized in INVALID_DOTNAMES: return False # Also check for 8.3 short name if normalized == b"git~1": return False return True def validate_path( path: bytes, element_validator: Callable[[bytes], bool] = validate_path_element_default, ) -> bool: """Default path validator that just checks for .git/.""" parts = path.split(b"/") for p in parts: if not element_validator(p): return False else: return True def build_index_from_tree( root_path: str | bytes, index_path: str | bytes, object_store: ObjectContainer, tree_id: ObjectID, honor_filemode: bool = True, validate_path_element: Callable[[bytes], bool] = validate_path_element_default, symlink_fn: Callable[ [str | bytes | os.PathLike[str], str | bytes | os.PathLike[str]], None ] | None = None, blob_normalizer: "FilterBlobNormalizer | None" = None, tree_encoding: str = "utf-8", ) -> None: """Generate and materialize index from a tree. Args: tree_id: Tree to materialize root_path: Target dir for materialized index files index_path: Target path for generated index object_store: Non-empty object store holding tree contents honor_filemode: An optional flag to honor core.filemode setting in config file, default is core.filemode=True, change executable bit validate_path_element: Function to validate path elements to check out; default just refuses .git and .. directories. symlink_fn: Function to use for creating symlinks blob_normalizer: An optional BlobNormalizer to use for converting line endings when writing blobs to the working directory. tree_encoding: Encoding used for tree paths (default: utf-8) Note: existing index is wiped and contents are not merged in a working dir. Suitable only for fresh clones. 
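# Worked example (illustrative only) of the path validators above; these are
# the checks build_index_from_tree() applies before writing to disk.
def _example_validate_paths() -> None:
    """Reject .git components; NTFS rules also catch 8.3 aliases."""
    assert validate_path(b"src/main.py")
    assert not validate_path(b"src/.git/hooks/post-checkout")
    assert validate_path_element_default(b".gitignore")
    assert not validate_path_element_ntfs(b".GIT. ")  # trailing dots/spaces stripped
    assert not validate_path_element_ntfs(b"GIT~1")   # 8.3 short name for .git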
""" index = Index(index_path, read=False) if not isinstance(root_path, bytes): root_path = os.fsencode(root_path) for entry in iter_tree_contents(object_store, tree_id): assert ( entry.path is not None and entry.mode is not None and entry.sha is not None ) if not validate_path(entry.path, validate_path_element): continue full_path = _tree_to_fs_path(root_path, entry.path, tree_encoding) if not os.path.exists(os.path.dirname(full_path)): os.makedirs(os.path.dirname(full_path)) # TODO(jelmer): Merge new index into working tree if S_ISGITLINK(entry.mode): if not os.path.isdir(full_path): os.mkdir(full_path) st = os.lstat(full_path) # TODO(jelmer): record and return submodule paths else: obj = object_store[entry.sha] assert isinstance(obj, Blob) # Apply blob normalization for checkout if normalizer is provided if blob_normalizer is not None: obj = blob_normalizer.checkout_normalize(obj, entry.path) st = build_file_from_blob( obj, entry.mode, full_path, honor_filemode=honor_filemode, tree_encoding=tree_encoding, symlink_fn=symlink_fn, ) # Add file to index if not honor_filemode or S_ISGITLINK(entry.mode): # we can not use tuple slicing to build a new tuple, # because on windows that will convert the times to # longs, which causes errors further along st_tuple = ( entry.mode, st.st_ino, st.st_dev, st.st_nlink, st.st_uid, st.st_gid, st.st_size, st.st_atime, st.st_mtime, st.st_ctime, ) st = st.__class__(st_tuple) # default to a stage 0 index entry (normal) # when reading from the filesystem index[entry.path] = index_entry_from_stat(st, entry.sha) index.write() def blob_from_path_and_mode( fs_path: bytes, mode: int, tree_encoding: str = "utf-8" ) -> Blob: """Create a blob from a path and a stat object. Args: fs_path: Full file system path to file mode: File mode tree_encoding: Encoding to use for tree contents Returns: A `Blob` object """ assert isinstance(fs_path, bytes) blob = Blob() if stat.S_ISLNK(mode): if sys.platform == "win32": # os.readlink on Python3 on Windows requires a unicode string. blob.data = os.readlink(os.fsdecode(fs_path)).encode(tree_encoding) else: blob.data = os.readlink(fs_path) else: with open(fs_path, "rb") as f: blob.data = f.read() return blob def blob_from_path_and_stat( fs_path: bytes, st: os.stat_result, tree_encoding: str = "utf-8" ) -> Blob: """Create a blob from a path and a stat object. Args: fs_path: Full file system path to file st: A stat object tree_encoding: Encoding to use for tree contents Returns: A `Blob` object """ return blob_from_path_and_mode(fs_path, st.st_mode, tree_encoding) def read_submodule_head(path: str | bytes) -> bytes | None: """Read the head commit of a submodule. Args: path: path to the submodule Returns: HEAD sha, None if not a valid head/repository """ from .errors import NotGitRepository from .repo import Repo # Repo currently expects a "str", so decode if necessary. # TODO(jelmer): Perhaps move this into Repo() ? if not isinstance(path, str): path = os.fsdecode(path) try: repo = Repo(path) except NotGitRepository: return None try: return repo.head() except KeyError: return None def _has_directory_changed(tree_path: bytes, entry: IndexEntry) -> bool: """Check if a directory has changed after getting an error. When handling an error trying to create a blob from a path, call this function. It will check if the path is a directory. If it's a directory and a submodule, check the submodule head to see if it's has changed. If not, consider the file as changed as Git tracked a file and not a directory. 
Return true if the given path should be considered as changed and False otherwise or if the path is not a directory. """ # This is actually a directory if os.path.exists(os.path.join(tree_path, b".git")): # Submodule head = read_submodule_head(tree_path) if entry.sha != head: return True else: # The file was changed to a directory, so consider it removed. return True return False os_sep_bytes = os.sep.encode("ascii") def _ensure_parent_dir_exists(full_path: bytes) -> None: """Ensure parent directory exists, checking no parent is a file.""" parent_dir = os.path.dirname(full_path) if parent_dir and not os.path.exists(parent_dir): # Walk up the directory tree to find the first existing parent current = parent_dir parents_to_check: list[bytes] = [] while current and not os.path.exists(current): parents_to_check.insert(0, current) new_parent = os.path.dirname(current) if new_parent == current: # Reached the root or can't go up further break current = new_parent # Check if the existing parent (if any) is a directory if current and os.path.exists(current) and not os.path.isdir(current): raise OSError( f"Cannot create directory, parent path is a file: {current!r}" ) # Now check each parent we need to create isn't blocked by an existing file for parent_path in parents_to_check: if os.path.exists(parent_path) and not os.path.isdir(parent_path): raise OSError( f"Cannot create directory, parent path is a file: {parent_path!r}" ) os.makedirs(parent_dir) def _remove_file_with_readonly_handling(path: bytes) -> None: """Remove a file, handling read-only files on Windows. Args: path: Path to the file to remove """ try: os.unlink(path) except PermissionError: # On Windows, remove read-only attribute and retry if sys.platform == "win32": os.chmod(path, stat.S_IWRITE | stat.S_IREAD) os.unlink(path) else: raise def _remove_empty_parents(path: bytes, stop_at: bytes) -> None: """Remove empty parent directories up to stop_at.""" parent = os.path.dirname(path) while parent and parent != stop_at: try: os.rmdir(parent) parent = os.path.dirname(parent) except FileNotFoundError: # Directory doesn't exist - stop trying break except OSError as e: if e.errno in (errno.ENOTEMPTY, errno.EEXIST): # Directory not empty - stop trying break raise def _check_symlink_matches( full_path: bytes, repo_object_store: "BaseObjectStore", entry_sha: ObjectID ) -> bool: """Check if symlink target matches expected target. Returns True if symlink matches, False if it doesn't match. """ try: current_target = os.readlink(full_path) blob_obj = repo_object_store[entry_sha] expected_target = blob_obj.as_raw_string() if isinstance(current_target, str): current_target = current_target.encode() return current_target == expected_target except FileNotFoundError: # Symlink doesn't exist return False except OSError as e: if e.errno == errno.EINVAL: # Not a symlink return False raise def _check_file_matches( repo_object_store: "BaseObjectStore", full_path: bytes, entry_sha: ObjectID, entry_mode: int, current_stat: os.stat_result, honor_filemode: bool, blob_normalizer: "FilterBlobNormalizer | None" = None, tree_path: bytes | None = None, ) -> bool: """Check if a file on disk matches the expected git object. Returns True if file matches, False if it doesn't match. 
""" # Check mode first (if honor_filemode is True) if honor_filemode: current_mode = stat.S_IMODE(current_stat.st_mode) expected_mode = stat.S_IMODE(entry_mode) # For regular files, only check the user executable bit, not group/other permissions # This matches Git's behavior where umask differences don't count as modifications if stat.S_ISREG(current_stat.st_mode): # Normalize regular file modes to ignore group/other write permissions current_mode_normalized = ( current_mode & 0o755 ) # Keep only user rwx and all read+execute expected_mode_normalized = expected_mode & 0o755 # For Git compatibility, regular files should be either 644 or 755 if expected_mode_normalized not in (0o644, 0o755): expected_mode_normalized = 0o644 # Default for regular files if current_mode_normalized not in (0o644, 0o755): # Determine if it should be executable based on user execute bit if current_mode & 0o100: # User execute bit is set current_mode_normalized = 0o755 else: current_mode_normalized = 0o644 if current_mode_normalized != expected_mode_normalized: return False else: # For non-regular files (symlinks, etc.), check mode exactly if current_mode != expected_mode: return False # If mode matches (or we don't care), check content via size first blob_obj = repo_object_store[entry_sha] if current_stat.st_size != blob_obj.raw_length(): return False # Size matches, check actual content try: with open(full_path, "rb") as f: current_content = f.read() expected_content = blob_obj.as_raw_string() if blob_normalizer and tree_path is not None: assert isinstance(blob_obj, Blob) normalized_blob = blob_normalizer.checkout_normalize( blob_obj, tree_path ) expected_content = normalized_blob.as_raw_string() return current_content == expected_content except (FileNotFoundError, PermissionError, IsADirectoryError): return False def _transition_to_submodule( repo: "Repo", path: bytes, full_path: bytes, current_stat: os.stat_result | None, entry: IndexEntry | TreeEntry, index: Index, ) -> None: """Transition any type to submodule.""" from .submodule import ensure_submodule_placeholder if current_stat is not None and stat.S_ISDIR(current_stat.st_mode): # Already a directory, just ensure .git file exists ensure_submodule_placeholder(repo, path) else: # Remove whatever is there and create submodule if current_stat is not None: _remove_file_with_readonly_handling(full_path) ensure_submodule_placeholder(repo, path) st = os.lstat(full_path) assert entry.sha is not None index[path] = index_entry_from_stat(st, entry.sha) def _transition_to_file( object_store: "BaseObjectStore", path: bytes, full_path: bytes, current_stat: os.stat_result | None, entry: IndexEntry | TreeEntry, index: Index, honor_filemode: bool, symlink_fn: Callable[ [str | bytes | os.PathLike[str], str | bytes | os.PathLike[str]], None ] | None, blob_normalizer: "FilterBlobNormalizer | None", tree_encoding: str = "utf-8", ) -> None: """Transition any type to regular file or symlink.""" assert entry.sha is not None and entry.mode is not None # Check if we need to update if ( current_stat is not None and stat.S_ISREG(current_stat.st_mode) and not stat.S_ISLNK(entry.mode) ): # File to file - check if update needed file_matches = _check_file_matches( object_store, full_path, entry.sha, entry.mode, current_stat, honor_filemode, blob_normalizer, path, ) needs_update = not file_matches elif ( current_stat is not None and stat.S_ISLNK(current_stat.st_mode) and stat.S_ISLNK(entry.mode) ): # Symlink to symlink - check if update needed symlink_matches = 
_check_symlink_matches(full_path, object_store, entry.sha) needs_update = not symlink_matches else: needs_update = True if not needs_update: # Just update index - current_stat should always be valid here since we're not updating assert current_stat is not None index[path] = index_entry_from_stat(current_stat, entry.sha) return # Remove existing entry if needed if current_stat is not None and stat.S_ISDIR(current_stat.st_mode): # Remove directory dir_contents = set(os.listdir(full_path)) git_file_name = b".git" if isinstance(full_path, bytes) else ".git" if git_file_name in dir_contents: if dir_contents != {git_file_name}: raise IsADirectoryError( f"Cannot replace submodule with untracked files: {full_path!r}" ) shutil.rmtree(full_path) else: try: os.rmdir(full_path) except OSError as e: if e.errno in (errno.ENOTEMPTY, errno.EEXIST): raise IsADirectoryError( f"Cannot replace non-empty directory with file: {full_path!r}" ) raise elif current_stat is not None: _remove_file_with_readonly_handling(full_path) # Ensure parent directory exists _ensure_parent_dir_exists(full_path) # Write the file blob_obj = object_store[entry.sha] assert isinstance(blob_obj, Blob) if blob_normalizer: blob_obj = blob_normalizer.checkout_normalize(blob_obj, path) st = build_file_from_blob( blob_obj, entry.mode, full_path, honor_filemode=honor_filemode, tree_encoding=tree_encoding, symlink_fn=symlink_fn, ) index[path] = index_entry_from_stat(st, entry.sha) def _transition_to_absent( repo: "Repo", path: bytes, full_path: bytes, current_stat: os.stat_result | None, index: Index, ) -> None: """Remove any type of entry.""" if current_stat is None: return if stat.S_ISDIR(current_stat.st_mode): # Check if it's a submodule directory dir_contents = set(os.listdir(full_path)) git_file_name = b".git" if isinstance(full_path, bytes) else ".git" if git_file_name in dir_contents and dir_contents == {git_file_name}: shutil.rmtree(full_path) else: try: os.rmdir(full_path) except OSError as e: if e.errno not in (errno.ENOTEMPTY, errno.EEXIST): raise else: _remove_file_with_readonly_handling(full_path) try: del index[path] except KeyError: pass # Try to remove empty parent directories _remove_empty_parents( full_path, repo.path if isinstance(repo.path, bytes) else repo.path.encode() ) def detect_case_only_renames( changes: Sequence["TreeChange"], config: "Config", ) -> list["TreeChange"]: """Detect and transform case-only renames in a list of tree changes. This function identifies file renames that only differ in case (e.g., README.txt -> readme.txt) and transforms matching ADD/DELETE pairs into CHANGE_RENAME operations. It uses filesystem-appropriate path normalization based on the repository configuration. 
Args: changes: List of TreeChange objects representing file changes config: Repository configuration object Returns: New list of TreeChange objects with case-only renames converted to CHANGE_RENAME """ from .diff_tree import ( CHANGE_ADD, CHANGE_COPY, CHANGE_DELETE, CHANGE_MODIFY, CHANGE_RENAME, TreeChange, ) # Build dictionaries of old and new paths with their normalized forms old_paths_normalized = {} new_paths_normalized = {} old_changes = {} # Map from old path to change object new_changes = {} # Map from new path to change object # Get the appropriate normalizer based on config normalize_func = get_path_element_normalizer(config) def normalize_path(path: bytes) -> bytes: """Normalize entire path using element normalization.""" return b"/".join(normalize_func(part) for part in path.split(b"/")) # Pre-normalize all paths once to avoid repeated normalization for change in changes: if change.type == CHANGE_DELETE and change.old: assert change.old.path is not None try: normalized = normalize_path(change.old.path) except UnicodeDecodeError: import logging logging.warning( "Skipping case-only rename detection for path with invalid UTF-8: %r", change.old.path, ) else: old_paths_normalized[normalized] = change.old.path old_changes[change.old.path] = change elif change.type == CHANGE_RENAME and change.old: assert change.old.path is not None # Treat RENAME as DELETE + ADD for case-only detection try: normalized = normalize_path(change.old.path) except UnicodeDecodeError: import logging logging.warning( "Skipping case-only rename detection for path with invalid UTF-8: %r", change.old.path, ) else: old_paths_normalized[normalized] = change.old.path old_changes[change.old.path] = change if ( change.type in (CHANGE_ADD, CHANGE_MODIFY, CHANGE_RENAME, CHANGE_COPY) and change.new ): assert change.new.path is not None try: normalized = normalize_path(change.new.path) except UnicodeDecodeError: import logging logging.warning( "Skipping case-only rename detection for path with invalid UTF-8: %r", change.new.path, ) else: new_paths_normalized[normalized] = change.new.path new_changes[change.new.path] = change # Find case-only renames and transform changes case_only_renames = set() new_rename_changes = [] for norm_path, old_path in old_paths_normalized.items(): if norm_path in new_paths_normalized: new_path = new_paths_normalized[norm_path] if old_path != new_path: # Found a case-only rename old_change = old_changes[old_path] new_change = new_changes[new_path] # Create a CHANGE_RENAME to replace the DELETE and ADD/MODIFY pair if new_change.type == CHANGE_ADD: # Simple case: DELETE + ADD becomes RENAME rename_change = TreeChange( CHANGE_RENAME, old_change.old, new_change.new ) else: # Complex case: DELETE + MODIFY becomes RENAME # Use the old file from DELETE and new file from MODIFY rename_change = TreeChange( CHANGE_RENAME, old_change.old, new_change.new ) new_rename_changes.append(rename_change) # Mark the old changes for removal case_only_renames.add(old_change) case_only_renames.add(new_change) # Return new list with original ADD/DELETE changes replaced by renames result = [change for change in changes if change not in case_only_renames] result.extend(new_rename_changes) return result def update_working_tree( repo: "Repo", old_tree_id: bytes | None, new_tree_id: bytes, change_iterator: Iterator["TreeChange"], honor_filemode: bool = True, validate_path_element: Callable[[bytes], bool] | None = None, symlink_fn: Callable[ [str | bytes | os.PathLike[str], str | bytes | os.PathLike[str]], None ] | None = 
None, force_remove_untracked: bool = False, blob_normalizer: "FilterBlobNormalizer | None" = None, tree_encoding: str = "utf-8", allow_overwrite_modified: bool = False, ) -> None: """Update the working tree and index to match a new tree. This function handles: - Adding new files - Updating modified files - Removing deleted files - Cleaning up empty directories Args: repo: Repository object old_tree_id: SHA of the tree before the update new_tree_id: SHA of the tree to update to change_iterator: Iterator of TreeChange objects to apply honor_filemode: An optional flag to honor core.filemode setting validate_path_element: Function to validate path elements to check out symlink_fn: Function to use for creating symlinks force_remove_untracked: If True, remove files that exist in working directory but not in target tree, even if old_tree_id is None blob_normalizer: An optional BlobNormalizer to use for converting line endings when writing blobs to the working directory. tree_encoding: Encoding used for tree paths (default: utf-8) allow_overwrite_modified: If False, raise an error when attempting to overwrite files that have been modified compared to old_tree_id """ if validate_path_element is None: validate_path_element = validate_path_element_default from .diff_tree import ( CHANGE_ADD, CHANGE_COPY, CHANGE_DELETE, CHANGE_MODIFY, CHANGE_RENAME, CHANGE_UNCHANGED, ) repo_path = repo.path if isinstance(repo.path, bytes) else repo.path.encode() index = repo.open_index() # Convert iterator to list since we need multiple passes changes = list(change_iterator) # Transform case-only renames on case-insensitive filesystems import platform default_ignore_case = platform.system() in ("Windows", "Darwin") config = repo.get_config() ignore_case = config.get_boolean((b"core",), b"ignorecase", default_ignore_case) if ignore_case: config = repo.get_config() changes = detect_case_only_renames(changes, config) # Check for path conflicts where files need to become directories paths_becoming_dirs = set() for change in changes: if change.type in (CHANGE_ADD, CHANGE_MODIFY, CHANGE_RENAME, CHANGE_COPY): assert change.new is not None path = change.new.path assert path is not None if b"/" in path: # This is a file inside a directory # Check if any parent path exists as a file in the old tree or changes parts = path.split(b"/") for i in range(1, len(parts)): parent = b"/".join(parts[:i]) # See if this parent path is being deleted (was a file, becoming a dir) for other_change in changes: if ( other_change.type == CHANGE_DELETE and other_change.old and other_change.old.path == parent ): paths_becoming_dirs.add(parent) # Check if any path that needs to become a directory has been modified for path in paths_becoming_dirs: full_path = _tree_to_fs_path(repo_path, path, tree_encoding) try: current_stat = os.lstat(full_path) except FileNotFoundError: continue # File doesn't exist, nothing to check except OSError as e: raise OSError( f"Cannot access {path.decode('utf-8', errors='replace')}: {e}" ) from e if stat.S_ISREG(current_stat.st_mode): # Find the old entry for this path old_change = None for change in changes: if ( change.type == CHANGE_DELETE and change.old and change.old.path == path ): old_change = change break if old_change: # Check if file has been modified assert old_change.old is not None assert ( old_change.old.sha is not None and old_change.old.mode is not None ) file_matches = _check_file_matches( repo.object_store, full_path, old_change.old.sha, old_change.old.mode, current_stat, honor_filemode, blob_normalizer, 
path, ) if not file_matches: raise OSError( f"Cannot replace modified file with directory: {path!r}" ) # Check for uncommitted modifications before making any changes if not allow_overwrite_modified and old_tree_id: for change in changes: # Only check files that are being modified or deleted if change.type in (CHANGE_MODIFY, CHANGE_DELETE) and change.old: path = change.old.path assert path is not None if path.startswith(b".git") or not validate_path( path, validate_path_element ): continue full_path = _tree_to_fs_path(repo_path, path, tree_encoding) try: current_stat = os.lstat(full_path) except FileNotFoundError: continue # File doesn't exist, nothing to check except OSError as e: raise OSError( f"Cannot access {path.decode('utf-8', errors='replace')}: {e}" ) from e if stat.S_ISREG(current_stat.st_mode): # Check if working tree file differs from old tree assert change.old.sha is not None and change.old.mode is not None file_matches = _check_file_matches( repo.object_store, full_path, change.old.sha, change.old.mode, current_stat, honor_filemode, blob_normalizer, path, ) if not file_matches: from .errors import WorkingTreeModifiedError raise WorkingTreeModifiedError( f"Your local changes to '{path.decode('utf-8', errors='replace')}' " f"would be overwritten by checkout. " f"Please commit your changes or stash them before you switch branches." ) # Apply the changes for change in changes: if change.type in (CHANGE_DELETE, CHANGE_RENAME): # Remove file/directory assert change.old is not None and change.old.path is not None path = change.old.path if path.startswith(b".git") or not validate_path( path, validate_path_element ): continue full_path = _tree_to_fs_path(repo_path, path, tree_encoding) try: delete_stat: os.stat_result | None = os.lstat(full_path) except FileNotFoundError: delete_stat = None except OSError as e: raise OSError( f"Cannot access {path.decode('utf-8', errors='replace')}: {e}" ) from e _transition_to_absent(repo, path, full_path, delete_stat, index) if change.type in ( CHANGE_ADD, CHANGE_MODIFY, CHANGE_UNCHANGED, CHANGE_COPY, CHANGE_RENAME, ): # Add or modify file assert ( change.new is not None and change.new.path is not None and change.new.mode is not None ) path = change.new.path if path.startswith(b".git") or not validate_path( path, validate_path_element ): continue full_path = _tree_to_fs_path(repo_path, path, tree_encoding) try: modify_stat: os.stat_result | None = os.lstat(full_path) except FileNotFoundError: modify_stat = None except OSError as e: raise OSError( f"Cannot access {path.decode('utf-8', errors='replace')}: {e}" ) from e if S_ISGITLINK(change.new.mode): _transition_to_submodule( repo, path, full_path, modify_stat, change.new, index ) else: _transition_to_file( repo.object_store, path, full_path, modify_stat, change.new, index, honor_filemode, symlink_fn, blob_normalizer, tree_encoding, ) index.write() def _stat_matches_entry(st: os.stat_result, entry: IndexEntry) -> bool: """Check if filesystem stat matches index entry stat. This is used to determine if a file might have changed without reading its content. Git uses this optimization to avoid expensive filter operations on unchanged files. 
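# Hedged sketch of driving update_working_tree() above from a tree diff;
# "repo", "old_tree" and "new_tree" are assumptions supplied by the caller.
def _example_switch_trees(repo: "Repo", old_tree: bytes, new_tree: bytes) -> None:
    """Apply the difference between two trees to the working copy and index."""
    from dulwich.diff_tree import tree_changes

    changes = tree_changes(repo.object_store, old_tree, new_tree)
    update_working_tree(repo, old_tree, new_tree, changes)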
Args: st: Filesystem stat result entry: Index entry to compare against Returns: True if stat matches and file is likely unchanged """ # Get entry mtime with nanosecond precision if available if isinstance(entry.mtime, tuple): entry_mtime_sec = entry.mtime[0] entry_mtime_nsec = entry.mtime[1] else: entry_mtime_sec = int(entry.mtime) entry_mtime_nsec = 0 # Compare modification time with nanosecond precision if available # This is important for fast workflows (e.g., stash) where files can be # modified multiple times within the same second if hasattr(st, "st_mtime_ns"): # Use nanosecond precision when available st_mtime_nsec = st.st_mtime_ns entry_mtime_nsec_total = entry_mtime_sec * 1_000_000_000 + entry_mtime_nsec if st_mtime_nsec != entry_mtime_nsec_total: return False else: # Fall back to second precision if int(st.st_mtime) != entry_mtime_sec: return False # Compare file size if st.st_size != entry.size: return False # If both mtime and size match, file is likely unchanged return True def _check_entry_for_changes( tree_path: bytes, entry: IndexEntry | ConflictedIndexEntry, root_path: bytes, filter_blob_callback: Callable[[Blob, bytes], Blob] | None = None, ) -> bytes | None: """Check a single index entry for changes. Args: tree_path: Path in the tree entry: Index entry to check root_path: Root filesystem path filter_blob_callback: Optional callback to filter blobs Returns: tree_path if changed, None otherwise """ if isinstance(entry, ConflictedIndexEntry): # Conflicted files are always unstaged return tree_path full_path = _tree_to_fs_path(root_path, tree_path) try: st = os.lstat(full_path) if stat.S_ISDIR(st.st_mode): if _has_directory_changed(tree_path, entry): return tree_path return None if not stat.S_ISREG(st.st_mode) and not stat.S_ISLNK(st.st_mode): return None # Optimization: If stat matches index entry (mtime and size unchanged), # we can skip reading and filtering the file entirely. This is a significant # performance improvement for repositories with many unchanged files. # Even with filters (e.g., LFS), if the file hasn't been modified (stat unchanged), # the filter output would be the same, so we can safely skip the expensive # filter operation. This addresses performance issues with LFS repositories # where filter operations can be very slow. if _stat_matches_entry(st, entry): return None blob = blob_from_path_and_stat(full_path, st) if filter_blob_callback is not None: blob = filter_blob_callback(blob, tree_path) except FileNotFoundError: # The file was removed, so we assume that counts as # different from whatever file used to exist. return tree_path else: if blob.id != entry.sha: return tree_path return None def get_unstaged_changes( index: Index, root_path: str | bytes, filter_blob_callback: Callable[..., Any] | None = None, preload_index: bool = False, ) -> Generator[bytes, None, None]: """Walk through an index and check for differences against working tree. 
Args: index: index to check root_path: path in which to find files filter_blob_callback: Optional callback to filter blobs preload_index: If True, use parallel threads to check files (requires threading support) Returns: iterator over paths with unstaged changes """ # For each entry in the index check the sha1 & ensure not staged if not isinstance(root_path, bytes): root_path = os.fsencode(root_path) if preload_index: # Use parallel processing for better performance on slow filesystems try: import multiprocessing from concurrent.futures import ThreadPoolExecutor except ImportError: # If threading is not available, fall back to serial processing preload_index = False else: # Collect all entries first entries = list(index.iteritems()) # Use number of CPUs but cap at 8 threads to avoid overhead num_workers = min(multiprocessing.cpu_count(), 8) # Process entries in parallel with ThreadPoolExecutor(max_workers=num_workers) as executor: # Submit all tasks futures = [ executor.submit( _check_entry_for_changes, tree_path, entry, root_path, filter_blob_callback, ) for tree_path, entry in entries ] # Yield results as they complete for future in futures: result = future.result() if result is not None: yield result if not preload_index: # Serial processing for tree_path, entry in index.iteritems(): result = _check_entry_for_changes( tree_path, entry, root_path, filter_blob_callback ) if result is not None: yield result def _tree_to_fs_path( root_path: bytes, tree_path: bytes, tree_encoding: str = "utf-8" ) -> bytes: """Convert a git tree path to a file system path. Args: root_path: Root filesystem path tree_path: Git tree path as bytes (encoded with tree_encoding) tree_encoding: Encoding used for tree paths (default: utf-8) Returns: File system path. """ assert isinstance(tree_path, bytes) if os_sep_bytes != b"/": sep_corrected_path = tree_path.replace(b"/", os_sep_bytes) else: sep_corrected_path = tree_path # On Windows, we need to handle tree path encoding properly if sys.platform == "win32": # Decode from tree encoding, then re-encode for filesystem try: tree_path_str = sep_corrected_path.decode(tree_encoding) sep_corrected_path = os.fsencode(tree_path_str) except UnicodeDecodeError: # If decoding fails, use the original bytes pass return os.path.join(root_path, sep_corrected_path) def _fs_to_tree_path(fs_path: str | bytes, tree_encoding: str = "utf-8") -> bytes: """Convert a file system path to a git tree path. Args: fs_path: File system path. tree_encoding: Encoding to use for tree paths (default: utf-8) Returns: Git tree path as bytes (encoded with tree_encoding) """ if not isinstance(fs_path, bytes): fs_path_bytes = os.fsencode(fs_path) else: fs_path_bytes = fs_path # On Windows, we need to ensure tree paths are properly encoded if sys.platform == "win32": try: # Decode from filesystem encoding, then re-encode with tree encoding fs_path_str = os.fsdecode(fs_path_bytes) fs_path_bytes = fs_path_str.encode(tree_encoding) except UnicodeDecodeError: # If filesystem decoding fails, use the original bytes pass if os_sep_bytes != b"/": tree_path = fs_path_bytes.replace(os_sep_bytes, b"/") else: tree_path = fs_path_bytes return tree_path def index_entry_from_directory(st: os.stat_result, path: bytes) -> IndexEntry | None: """Create an index entry for a directory. This is only used for submodules (directories containing .git). 
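# Illustrative sketch of get_unstaged_changes() above; "repo" is an
# assumption (an open dulwich Repo with a working tree).
def _example_unstaged(repo: "Repo") -> list[bytes]:
    """Return tree paths whose working-tree contents differ from the index."""
    index = repo.open_index()
    return list(get_unstaged_changes(index, repo.path))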
Args: st: Stat result for the directory path: Path to the directory Returns: IndexEntry for a submodule, or None if not a submodule """ if os.path.exists(os.path.join(path, b".git")): head = read_submodule_head(path) if head is None: return None return index_entry_from_stat(st, head, mode=S_IFGITLINK) return None def index_entry_from_path( path: bytes, object_store: ObjectContainer | None = None ) -> IndexEntry | None: """Create an index from a filesystem path. This returns an index value for files, symlinks and tree references. for directories and non-existent files it returns None Args: path: Path to create an index entry for object_store: Optional object store to save new blobs in Returns: An index entry; None for directories """ assert isinstance(path, bytes) st = os.lstat(path) if stat.S_ISDIR(st.st_mode): return index_entry_from_directory(st, path) if stat.S_ISREG(st.st_mode) or stat.S_ISLNK(st.st_mode): blob = blob_from_path_and_stat(path, st) if object_store is not None: object_store.add_object(blob) return index_entry_from_stat(st, blob.id) return None def iter_fresh_entries( paths: Iterable[bytes], root_path: bytes, object_store: ObjectContainer | None = None, ) -> Iterator[tuple[bytes, IndexEntry | None]]: """Iterate over current versions of index entries on disk. Args: paths: Paths to iterate over root_path: Root path to access from object_store: Optional store to save new blobs in Returns: Iterator over path, index_entry """ for path in paths: p = _tree_to_fs_path(root_path, path) try: entry = index_entry_from_path(p, object_store=object_store) except (FileNotFoundError, IsADirectoryError): entry = None yield path, entry def iter_fresh_objects( paths: Iterable[bytes], root_path: bytes, include_deleted: bool = False, object_store: ObjectContainer | None = None, ) -> Iterator[tuple[bytes, ObjectID | None, int | None]]: """Iterate over versions of objects on disk referenced by index. Args: paths: Paths to check root_path: Root path to access from include_deleted: Include deleted entries with sha and mode set to None object_store: Optional object store to report new items to Returns: Iterator over path, sha, mode """ for path, entry in iter_fresh_entries(paths, root_path, object_store=object_store): if entry is None: if include_deleted: yield path, None, None else: yield path, entry.sha, cleanup_mode(entry.mode) def refresh_index(index: Index, root_path: bytes) -> None: """Refresh the contents of an index. This is the equivalent to running 'git commit -a'. Args: index: Index to update root_path: Root filesystem path """ for path, entry in iter_fresh_entries(index, root_path): if entry: index[path] = entry class locked_index: """Lock the index while making modifications. Works as a context manager. 
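# Illustrative sketch of the locked_index context manager defined above; the
# index path and file name are assumptions. Changes are written back under
# the lock when the block exits without an exception.
def _example_locked_update(index_path: str, name: bytes) -> None:
    """Atomically drop one path from the index under a lock."""
    with locked_index(index_path) as index:
        if name in index:
            del index[name]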
""" _file: "_GitFile" def __init__(self, path: bytes | str) -> None: """Initialize locked_index.""" self._path = path def __enter__(self) -> Index: """Enter context manager and lock index.""" f = GitFile(self._path, "wb") self._file = f self._index = Index(self._path) return self._index def __exit__( self, exc_type: type | None, exc_value: BaseException | None, traceback: types.TracebackType | None, ) -> None: """Exit context manager and unlock index.""" if exc_type is not None: self._file.abort() return try: f = SHA1Writer(self._file) write_index_dict(f, self._index._byname) except BaseException: self._file.abort() else: f.close() dulwich-1.0.0/dulwich/lfs.py000066400000000000000000000627551513301442600157620ustar00rootroot00000000000000# lfs.py -- Implementation of the LFS # Copyright (C) 2020 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git Large File Storage (LFS) support. This module provides support for Git LFS, which is a Git extension for versioning large files. It replaces large files with text pointers inside Git, while storing the file contents on a remote server. 
Key components: - LFS pointer file parsing and creation - LFS object storage and retrieval - HTTP client for LFS server communication - Integration with dulwich repositories """ __all__ = [ "FileLFSClient", "HTTPLFSClient", "LFSAction", "LFSBatchObject", "LFSBatchResponse", "LFSClient", "LFSError", "LFSErrorInfo", "LFSFilterDriver", "LFSPointer", "LFSStore", ] import hashlib import json import logging import os import tempfile from collections.abc import Iterable, Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any, BinaryIO from urllib.parse import urljoin, urlparse from urllib.request import Request, urlopen logger = logging.getLogger(__name__) if TYPE_CHECKING: import urllib3 from .client import AuthCallbackPoolManager from .config import Config from .repo import Repo @dataclass class LFSAction: """LFS action structure.""" href: str header: dict[str, str] | None = None expires_at: str | None = None @dataclass class LFSErrorInfo: """LFS error structure.""" code: int message: str @dataclass class LFSBatchObject: """LFS batch object structure.""" oid: str size: int authenticated: bool | None = None actions: dict[str, LFSAction] | None = None error: LFSErrorInfo | None = None @dataclass class LFSBatchResponse: """LFS batch response structure.""" transfer: str objects: list[LFSBatchObject] hash_algo: str | None = None class LFSStore: """Stores objects on disk, indexed by SHA256.""" def __init__(self, path: str) -> None: """Initialize LFSStore.""" self.path = path @classmethod def create(cls, lfs_dir: str) -> "LFSStore": """Create a new LFS store.""" if not os.path.isdir(lfs_dir): os.mkdir(lfs_dir) tmp_dir = os.path.join(lfs_dir, "tmp") if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir) objects_dir = os.path.join(lfs_dir, "objects") if not os.path.isdir(objects_dir): os.mkdir(objects_dir) return cls(lfs_dir) @classmethod def from_repo(cls, repo: "Repo", create: bool = False) -> "LFSStore": """Create LFS store from repository.""" lfs_dir = os.path.join(repo.controldir(), "lfs") if create: return cls.create(lfs_dir) return cls(lfs_dir) @classmethod def from_controldir(cls, controldir: str, create: bool = False) -> "LFSStore": """Create LFS store from control directory.""" lfs_dir = os.path.join(controldir, "lfs") if create: return cls.create(lfs_dir) return cls(lfs_dir) def _sha_path(self, sha: str) -> str: return os.path.join(self.path, "objects", sha[0:2], sha[2:4], sha) def open_object(self, sha: str) -> BinaryIO: """Open an object by sha.""" try: return open(self._sha_path(sha), "rb") except FileNotFoundError as exc: raise KeyError(sha) from exc def write_object(self, chunks: Iterable[bytes]) -> str: """Write an object. 
Returns: object SHA """ # First pass: compute SHA256 and collect data sha = hashlib.sha256() data_chunks = [] for chunk in chunks: sha.update(chunk) data_chunks.append(chunk) sha_hex = sha.hexdigest() path = self._sha_path(sha_hex) # If object already exists, no need to write if os.path.exists(path): return sha_hex # Object doesn't exist, write it if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) tmpdir = os.path.join(self.path, "tmp") with tempfile.NamedTemporaryFile(dir=tmpdir, mode="wb", delete=False) as f: for chunk in data_chunks: f.write(chunk) f.flush() tmppath = f.name # Handle concurrent writes - if file already exists, just remove temp file if os.path.exists(path): os.remove(tmppath) else: os.rename(tmppath, path) return sha_hex class LFSPointer: """Represents an LFS pointer file.""" def __init__(self, oid: str, size: int) -> None: """Initialize LFSPointer.""" self.oid = oid self.size = size @classmethod def from_bytes(cls, data: bytes) -> "LFSPointer | None": """Parse LFS pointer from bytes. Returns None if data is not a valid LFS pointer. """ try: text = data.decode("utf-8") except UnicodeDecodeError: return None # LFS pointer files have a specific format lines = text.strip().split("\n") if len(lines) < 3: return None # Must start with version if not lines[0].startswith("version https://git-lfs.github.com/spec/v1"): return None oid = None size = None for line in lines[1:]: if line.startswith("oid sha256:"): oid = line[11:].strip() elif line.startswith("size "): try: size = int(line[5:].strip()) # Size must be non-negative if size < 0: return None except ValueError: return None if oid is None or size is None: return None return cls(oid, size) def to_bytes(self) -> bytes: """Convert LFS pointer to bytes.""" return ( f"version https://git-lfs.github.com/spec/v1\n" f"oid sha256:{self.oid}\n" f"size {self.size}\n" ).encode() def is_valid_oid(self) -> bool: """Check if the OID is valid SHA256.""" if len(self.oid) != 64: return False try: int(self.oid, 16) return True except ValueError: return False class LFSFilterDriver: """LFS filter driver implementation.""" def __init__(self, lfs_store: "LFSStore", config: "Config | None" = None) -> None: """Initialize LFSFilterDriver.""" self.lfs_store = lfs_store self.config = config def clean(self, data: bytes) -> bytes: """Convert file content to LFS pointer (clean filter).""" # Check if data is already an LFS pointer pointer = LFSPointer.from_bytes(data) if pointer is not None: return data # Store the file content in LFS sha = self.lfs_store.write_object([data]) # Create and return LFS pointer pointer = LFSPointer(sha, len(data)) return pointer.to_bytes() def smudge(self, data: bytes, path: bytes = b"") -> bytes: """Convert LFS pointer to file content (smudge filter).""" # Try to parse as LFS pointer pointer = LFSPointer.from_bytes(data) if pointer is None: # Not an LFS pointer, return as-is return data # Validate the pointer if not pointer.is_valid_oid(): return data try: # Read the actual content from LFS store with self.lfs_store.open_object(pointer.oid) as f: return f.read() except KeyError: # Object not found in LFS store, try to download it try: content = self._download_object(pointer) return content except LFSError as e: # Download failed, fall back to returning pointer logger.warning("LFS object download failed for %s: %s", pointer.oid, e) # Return pointer as-is when object is missing and download failed return data def _download_object(self, pointer: LFSPointer) -> bytes: """Download an LFS object from 
the server. Args: pointer: LFS pointer containing OID and size Returns: Downloaded content Raises: LFSError: If download fails for any reason """ if self.config is None: raise LFSError("No configuration available for LFS download") # Create LFS client and download client = LFSClient.from_config(self.config) if client is None: raise LFSError("No LFS client available from configuration") content = client.download(pointer.oid, pointer.size) # Store the downloaded content in local LFS store stored_oid = self.lfs_store.write_object([content]) # Verify the stored OID matches what we expected if stored_oid != pointer.oid: raise LFSError( f"Downloaded OID mismatch: expected {pointer.oid}, got {stored_oid}" ) return content def cleanup(self) -> None: """Clean up any resources held by this filter driver.""" # LFSFilterDriver doesn't hold any resources that need cleanup def reuse(self, config: "Config | None", filter_name: str) -> bool: """Check if this filter driver should be reused with the given configuration.""" # LFSFilterDriver is stateless and lightweight, no need to cache return False def _get_lfs_user_agent(config: "Config | None") -> str: """Get User-Agent string for LFS requests, respecting git config.""" try: if config: # Use configured user agent verbatim if set return config.get(b"http", b"useragent").decode() except KeyError: pass # Default LFS user agent (similar to git-lfs format) from . import __version__ version_str = ".".join([str(x) for x in __version__]) return f"git-lfs/dulwich/{version_str}" def _is_valid_lfs_url(url: str) -> bool: """Check if a URL is valid for LFS. Git LFS supports http://, https://, and file:// URLs. Args: url: URL to validate Returns: True if URL is a valid LFS URL, False otherwise """ parsed = urlparse(url) # Must have a scheme if not parsed.scheme: return False # Only support http, https, and file schemes if parsed.scheme not in ("http", "https", "file"): return False # http/https require a hostname if parsed.scheme in ("http", "https"): return bool(parsed.netloc) # file:// URLs must have a path (netloc is typically empty) if parsed.scheme == "file": return bool(parsed.path) return False class LFSClient: """Base class for LFS client operations.""" def __init__(self, url: str, config: "Config | None" = None) -> None: """Initialize LFS client. Args: url: LFS server URL (http://, https://, or file://) config: Optional git config for authentication/proxy settings """ self._base_url = url.rstrip("/") + "/" # Ensure trailing slash for urljoin self.config = config @property def url(self) -> str: """Get the LFS server URL without trailing slash.""" return self._base_url.rstrip("/") def download(self, oid: str, size: int, ref: str | None = None) -> bytes: """Download an LFS object. Args: oid: Object ID (SHA256) size: Expected size ref: Optional ref name Returns: Object content """ raise NotImplementedError def upload( self, oid: str, size: int, content: bytes, ref: str | None = None ) -> None: """Upload an LFS object. Args: oid: Object ID (SHA256) size: Object size content: Object content ref: Optional ref name """ raise NotImplementedError @classmethod def from_config(cls, config: "Config") -> "LFSClient | None": """Create LFS client from git config. Returns the appropriate subclass (HTTPLFSClient or FileLFSClient) based on the URL scheme. 
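        Example (sketch; assumes an existing dulwich Repo named "repo" and an
        LFSPointer named "ptr"):

            config = repo.get_config_stack()
            client = LFSClient.from_config(config)
            if client is not None:
                content = client.download(ptr.oid, ptr.size)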
""" # Try to get LFS URL from config first try: url = config.get((b"lfs",), b"url").decode() except KeyError: pass else: # Validate explicitly configured URL - raise error if invalid if not _is_valid_lfs_url(url): raise ValueError( f"Invalid lfs.url in config: {url!r}. " "URL must be an absolute URL with scheme http://, https://, or file://." ) # Return appropriate client based on scheme parsed = urlparse(url) if parsed.scheme in ("http", "https"): return HTTPLFSClient(url, config) elif parsed.scheme == "file": return FileLFSClient(url, config) else: # This shouldn't happen if _is_valid_lfs_url works correctly raise ValueError(f"Unsupported LFS URL scheme: {parsed.scheme}") # Fall back to deriving from remote URL (same as git-lfs) try: remote_url = config.get((b"remote", b"origin"), b"url").decode() except KeyError: pass else: # Convert SSH URLs to HTTPS if needed if remote_url.startswith("git@"): # Convert git@host:user/repo.git to https://host/user/repo.git if ":" in remote_url and "/" in remote_url: host_and_path = remote_url[4:] # Remove "git@" if ":" in host_and_path: host, path = host_and_path.split(":", 1) remote_url = f"https://{host}/{path}" # Ensure URL ends with .git for consistent LFS endpoint if not remote_url.endswith(".git"): remote_url = f"{remote_url}.git" # Standard LFS endpoint is remote_url + "/info/lfs" lfs_url = f"{remote_url}/info/lfs" # Return None if derived URL is invalid (LFS is optional) if not _is_valid_lfs_url(lfs_url): return None # Derived URLs are always http/https return HTTPLFSClient(lfs_url, config) return None class HTTPLFSClient(LFSClient): """LFS client for HTTP/HTTPS operations.""" def __init__(self, url: str, config: "Config | None" = None) -> None: """Initialize HTTP LFS client. Args: url: LFS server URL (http:// or https://) config: Optional git config for authentication/proxy settings """ super().__init__(url, config) self._pool_manager: ( urllib3.PoolManager | urllib3.ProxyManager | AuthCallbackPoolManager | None ) = None def _get_pool_manager( self, ) -> "urllib3.PoolManager | urllib3.ProxyManager | AuthCallbackPoolManager": """Get urllib3 pool manager with git config applied.""" if self._pool_manager is None: from dulwich.client import default_urllib3_manager self._pool_manager = default_urllib3_manager(self.config) return self._pool_manager def _make_request( self, method: str, path: str, data: bytes | None = None, headers: dict[str, str] | None = None, ) -> bytes: """Make an HTTP request to the LFS server.""" url = urljoin(self._base_url, path) req_headers = { "Accept": "application/vnd.git-lfs+json", "Content-Type": "application/vnd.git-lfs+json", "User-Agent": _get_lfs_user_agent(self.config), } if headers: req_headers.update(headers) # Use urllib3 pool manager with git config applied pool_manager = self._get_pool_manager() response = pool_manager.request(method, url, headers=req_headers, body=data) if response.status >= 400: raise ValueError( f"HTTP {response.status}: {response.data.decode('utf-8', errors='ignore')}" ) return response.data def batch( self, operation: str, objects: list[dict[str, str | int]], ref: str | None = None, ) -> LFSBatchResponse: """Perform batch operation to get transfer URLs. 
Args: operation: "download" or "upload" objects: List of {"oid": str, "size": int} dicts ref: Optional ref name Returns: Batch response from server """ data: dict[ str, str | list[str] | list[dict[str, str | int]] | dict[str, str] ] = { "operation": operation, "transfers": ["basic"], "objects": objects, } if ref: data["ref"] = {"name": ref} response = self._make_request( "POST", "objects/batch", json.dumps(data).encode("utf-8") ) if not response: raise ValueError("Empty response from LFS server") response_data = json.loads(response) return self._parse_batch_response(response_data) def _parse_batch_response(self, data: Mapping[str, Any]) -> LFSBatchResponse: """Parse JSON response into LFSBatchResponse dataclass.""" objects = [] for obj_data in data.get("objects", []): actions = None if "actions" in obj_data: actions = {} for action_name, action_data in obj_data["actions"].items(): actions[action_name] = LFSAction( href=action_data["href"], header=action_data.get("header"), expires_at=action_data.get("expires_at"), ) error = None if "error" in obj_data: error = LFSErrorInfo( code=obj_data["error"]["code"], message=obj_data["error"]["message"] ) batch_obj = LFSBatchObject( oid=obj_data["oid"], size=obj_data["size"], authenticated=obj_data.get("authenticated"), actions=actions, error=error, ) objects.append(batch_obj) return LFSBatchResponse( transfer=data.get("transfer", "basic"), objects=objects, hash_algo=data.get("hash_algo"), ) def download(self, oid: str, size: int, ref: str | None = None) -> bytes: """Download an LFS object. Args: oid: Object ID (SHA256) size: Expected size ref: Optional ref name Returns: Object content """ # Get download URL via batch API batch_resp = self.batch("download", [{"oid": oid, "size": size}], ref) if not batch_resp.objects: raise LFSError(f"No objects returned for {oid}") obj = batch_resp.objects[0] if obj.error: raise LFSError(f"Server error for {oid}: {obj.error.message}") if not obj.actions or "download" not in obj.actions: raise LFSError(f"No download actions for {oid}") download_action = obj.actions["download"] download_url = download_action.href # Download the object using urllib3 with git config download_headers = {"User-Agent": _get_lfs_user_agent(self.config)} if download_action.header: download_headers.update(download_action.header) pool_manager = self._get_pool_manager() response = pool_manager.request("GET", download_url, headers=download_headers) content = response.data # Verify size if len(content) != size: raise LFSError(f"Downloaded size {len(content)} != expected {size}") # Verify SHA256 actual_oid = hashlib.sha256(content).hexdigest() if actual_oid != oid: raise LFSError(f"Downloaded OID {actual_oid} != expected {oid}") return content def upload( self, oid: str, size: int, content: bytes, ref: str | None = None ) -> None: """Upload an LFS object. 
Args: oid: Object ID (SHA256) size: Object size content: Object content ref: Optional ref name """ # Get upload URL via batch API batch_resp = self.batch("upload", [{"oid": oid, "size": size}], ref) if not batch_resp.objects: raise LFSError(f"No objects returned for {oid}") obj = batch_resp.objects[0] if obj.error: raise LFSError(f"Server error for {oid}: {obj.error.message}") # If no actions, object already exists if not obj.actions: return if "upload" not in obj.actions: raise LFSError(f"No upload action for {oid}") upload_action = obj.actions["upload"] upload_url = upload_action.href # Upload the object req = Request(upload_url, data=content, method="PUT") if upload_action.header: for name, value in upload_action.header.items(): req.add_header(name, value) with urlopen(req) as response: if response.status >= 400: raise LFSError(f"Upload failed with status {response.status}") # Verify if needed if obj.actions and "verify" in obj.actions: verify_action = obj.actions["verify"] verify_data = json.dumps({"oid": oid, "size": size}).encode("utf-8") req = Request(verify_action.href, data=verify_data, method="POST") req.add_header("Content-Type", "application/vnd.git-lfs+json") if verify_action.header: for name, value in verify_action.header.items(): req.add_header(name, value) with urlopen(req) as response: if response.status >= 400: raise LFSError(f"Verification failed with status {response.status}") class FileLFSClient(LFSClient): """LFS client for file:// URLs that accesses local filesystem.""" def __init__(self, url: str, config: "Config | None" = None) -> None: """Initialize File LFS client. Args: url: LFS server URL (file://) config: Optional git config (unused for file:// URLs) """ super().__init__(url, config) # Convert file:// URL to filesystem path from urllib.request import url2pathname parsed = urlparse(url) if parsed.scheme != "file": raise ValueError(f"FileLFSClient requires file:// URL, got {url!r}") # url2pathname handles the conversion properly across platforms path = url2pathname(parsed.path) self._local_store = LFSStore(path) def download(self, oid: str, size: int, ref: str | None = None) -> bytes: """Download an LFS object from local filesystem. Args: oid: Object ID (SHA256) size: Expected size ref: Optional ref name (ignored for file:// URLs) Returns: Object content Raises: LFSError: If object not found or size mismatch """ try: with self._local_store.open_object(oid) as f: content = f.read() except KeyError as exc: raise LFSError(f"Object not found: {oid}") from exc # Verify size if len(content) != size: raise LFSError(f"Size mismatch: expected {size}, got {len(content)}") # Verify SHA256 actual_oid = hashlib.sha256(content).hexdigest() if actual_oid != oid: raise LFSError(f"OID mismatch: expected {oid}, got {actual_oid}") return content def upload( self, oid: str, size: int, content: bytes, ref: str | None = None ) -> None: """Upload an LFS object to local filesystem. 
Args: oid: Object ID (SHA256) size: Object size content: Object content ref: Optional ref name (ignored for file:// URLs) Raises: LFSError: If size or OID mismatch """ # Verify size if len(content) != size: raise LFSError(f"Size mismatch: expected {size}, got {len(content)}") # Verify SHA256 actual_oid = hashlib.sha256(content).hexdigest() if actual_oid != oid: raise LFSError(f"OID mismatch: expected {oid}, got {actual_oid}") # Store the object stored_oid = self._local_store.write_object([content]) if stored_oid != oid: raise LFSError(f"Storage OID mismatch: expected {oid}, got {stored_oid}") class LFSError(Exception): """LFS-specific error.""" dulwich-1.0.0/dulwich/lfs_server.py000066400000000000000000000231571513301442600173410ustar00rootroot00000000000000# lfs_server.py -- Simple Git LFS server implementation # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Simple Git LFS server implementation for testing.""" __all__ = [ "LFSRequestHandler", "LFSServer", "run_lfs_server", ] import hashlib import json import tempfile import typing from collections.abc import Mapping from http.server import BaseHTTPRequestHandler, HTTPServer from .lfs import LFSStore class LFSRequestHandler(BaseHTTPRequestHandler): """HTTP request handler for LFS operations.""" server: "LFSServer" # Type annotation for the server attribute def send_json_response( self, status_code: int, data: Mapping[str, typing.Any] ) -> None: """Send a JSON response.""" response = json.dumps(data).encode("utf-8") self.send_response(status_code) self.send_header("Content-Type", "application/vnd.git-lfs+json") self.send_header("Content-Length", str(len(response))) self.end_headers() self.wfile.write(response) def do_POST(self) -> None: """Handle POST requests.""" if self.path == "/objects/batch": self.handle_batch() elif self.path.startswith("/objects/") and self.path.endswith("/verify"): self.handle_verify() else: self.send_error(404, "Not Found") def do_PUT(self) -> None: """Handle PUT requests (uploads).""" if self.path.startswith("/objects/"): self.handle_upload() else: self.send_error(404, "Not Found") def do_GET(self) -> None: """Handle GET requests (downloads).""" if self.path.startswith("/objects/"): self.handle_download() else: self.send_error(404, "Not Found") def handle_batch(self) -> None: """Handle batch API requests.""" content_length = int(self.headers["Content-Length"]) request_data = self.rfile.read(content_length) try: batch_request = json.loads(request_data) except json.JSONDecodeError: self.send_error(400, "Invalid JSON") return operation = batch_request.get("operation") objects = batch_request.get("objects", []) if operation not in ["download", "upload"]: self.send_error(400, "Invalid operation") return 
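        # Build one response entry per requested object. For "download", an
        # existing object gets a "download" action (href plus headers) and a
        # missing one gets a 404 error entry; for "upload", every object gets
        # an "upload" action and a "verify" action pointing back at this
        # server.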
response_objects = [] for obj in objects: oid = obj.get("oid") size = obj.get("size") if not oid or size is None: response_objects.append( { "oid": oid, "size": size, "error": {"code": 400, "message": "Missing oid or size"}, } ) continue response_obj = { "oid": oid, "size": size, } if operation == "download": # Check if object exists if self._object_exists(oid): response_obj["actions"] = { "download": { "href": f"http://{self.headers['Host']}/objects/{oid}", "header": {"Accept": "application/octet-stream"}, } } else: response_obj["error"] = {"code": 404, "message": "Object not found"} else: # upload response_obj["actions"] = { "upload": { "href": f"http://{self.headers['Host']}/objects/{oid}", "header": {"Content-Type": "application/octet-stream"}, }, "verify": { "href": f"http://{self.headers['Host']}/objects/{oid}/verify" }, } response_objects.append(response_obj) self.send_json_response(200, {"objects": response_objects}) def handle_download(self) -> None: """Handle object download requests.""" # Extract OID from path path_parts = self.path.strip("/").split("/") if len(path_parts) != 2: self.send_error(404, "Not Found") return oid = path_parts[1] try: with self.server.lfs_store.open_object(oid) as f: content = f.read() self.send_response(200) self.send_header("Content-Type", "application/octet-stream") self.send_header("Content-Length", str(len(content))) self.end_headers() self.wfile.write(content) except KeyError: self.send_error(404, "Object not found") def handle_upload(self) -> None: """Handle object upload requests.""" # Extract OID from path path_parts = self.path.strip("/").split("/") if len(path_parts) != 2: self.send_error(404, "Not Found") return oid = path_parts[1] content_length = int(self.headers["Content-Length"]) # Read content in chunks chunks = [] remaining = content_length while remaining > 0: chunk_size = min(8192, remaining) chunk = self.rfile.read(chunk_size) if not chunk: break chunks.append(chunk) remaining -= len(chunk) # Calculate SHA256 content = b"".join(chunks) calculated_oid = hashlib.sha256(content).hexdigest() # Verify OID matches if calculated_oid != oid: self.send_error(400, f"OID mismatch: expected {oid}, got {calculated_oid}") return # Check if object already exists if not self._object_exists(oid): # Store the object only if it doesn't exist self.server.lfs_store.write_object(chunks) self.send_response(200) self.end_headers() def handle_verify(self) -> None: """Handle object verification requests.""" # Extract OID from path path_parts = self.path.strip("/").split("/") if len(path_parts) != 3 or path_parts[2] != "verify": self.send_error(404, "Not Found") return oid = path_parts[1] content_length = int(self.headers.get("Content-Length", 0)) if content_length > 0: request_data = self.rfile.read(content_length) try: verify_request = json.loads(request_data) # Optionally validate size if "size" in verify_request: # Could verify size matches stored object pass except json.JSONDecodeError: pass # Check if object exists if self._object_exists(oid): self.send_response(200) self.end_headers() else: self.send_error(404, "Object not found") def _object_exists(self, oid: str) -> bool: """Check if an object exists in the store.""" try: # Try to open the object - if it exists, close it immediately with self.server.lfs_store.open_object(oid): return True except KeyError: return False def log_message(self, format: str, *args: object) -> None: """Override to suppress request logging during tests.""" if self.server.log_requests: super().log_message(format, *args) class 
LFSServer(HTTPServer): """Simple LFS server for testing.""" def __init__( self, server_address: tuple[str, int], lfs_store: LFSStore, log_requests: bool = False, ) -> None: """Initialize LFSServer. Args: server_address: Tuple of (host, port) to bind to lfs_store: LFS store instance to use log_requests: Whether to log incoming requests """ super().__init__(server_address, LFSRequestHandler) self.lfs_store = lfs_store self.log_requests = log_requests def run_lfs_server( host: str = "localhost", port: int = 0, lfs_dir: str | None = None, log_requests: bool = False, ) -> tuple[LFSServer, str]: """Run an LFS server. Args: host: Host to bind to port: Port to bind to (0 for random) lfs_dir: Directory for LFS storage (temp dir if None) log_requests: Whether to log HTTP requests Returns: Tuple of (server, url) where url is the base URL for the server """ if lfs_dir is None: lfs_dir = tempfile.mkdtemp() lfs_store = LFSStore.create(lfs_dir) server = LFSServer((host, port), lfs_store, log_requests) # Get the actual port if we used 0 actual_port = server.server_address[1] url = f"http://{host}:{actual_port}" return server, url dulwich-1.0.0/dulwich/line_ending.py000066400000000000000000000565751513301442600174540ustar00rootroot00000000000000# line_ending.py -- Line ending conversion functions # Copyright (C) 2018-2018 Boris Feld # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # r"""All line-ending related functions, from conversions to config processing. Line-ending normalization is a complex beast. Here is some notes and details about how it seems to work. The normalization is a two-fold process that happens at two moments: - When reading a file from the index and to the working directory. For example when doing a ``git clone`` or ``git checkout`` call. This is called the smudge filter (repository -> working tree). - When writing a file to the index from the working directory. For example when doing a ``git add`` call. This is called the clean filter (working tree -> repository). Note that when checking status (getting unstaged changes), whether or not normalization is done on write depends on whether or not the file in the working dir has also been normalized on read: - For autocrlf=true all files are always normalized on both read and write. - For autocrlf=input files are only normalized on write if they are newly "added". Since files which are already committed are not normalized on checkout into the working tree, they are also left alone when staging modifications into the index. One thing to know is that Git does line-ending normalization only on text files. How does Git know that a file is text? We can either mark a file as a text file, a binary file or ask Git to automatically decides. 
Git has a heuristic to detect whether a file is a text file or a binary file.
It seems to be based on the percentage of non-printable characters in the file.
The code for this heuristic is here:
https://git.kernel.org/pub/scm/git/git.git/tree/convert.c#n46

Dulwich has an implementation with a slightly different heuristic, the
`dulwich.patch.is_binary` function. The binary detection heuristic
implementation is close to the one in JGit:
https://github.com/eclipse/jgit/blob/f6873ffe522bbc3536969a3a3546bf9a819b92bf/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#L300

There are multiple variables that impact the normalization.

First, a repository can contain a ``.gitattributes`` file (or more than one...)
that can further customize the operation for some file patterns, for example:

\*.txt text

    Force all ``.txt`` files to be treated as text files and to have their
    line endings normalized.

\*.jpg -text

    Force all ``.jpg`` files to be treated as binary files and to not have
    their line endings converted.

\*.vcproj text eol=crlf

    Force all ``.vcproj`` files to be treated as text files and to have their
    line endings converted to ``CRLF`` in the working directory, no matter the
    native EOL of the platform.

\*.sh text eol=lf

    Force all ``.sh`` files to be treated as text files and to have their line
    endings converted to ``LF`` in the working directory, no matter the native
    EOL of the platform.

If the ``eol`` attribute is not defined, Git uses the ``core.eol``
configuration value described later.

\* text=auto

    Force all files to be scanned by the text file heuristic detection and to
    have their line endings normalized in case they are detected as text
    files.

Git also has an obsolete attribute named ``crlf`` that can be translated to the
corresponding ``text`` attribute value.

Then there are some configuration options (that can be defined at the
repository or user level):

- core.autocrlf
- core.eol

``core.autocrlf`` is taken into account for all files that don't have a
``text`` attribute defined in ``.gitattributes``; it takes three possible
values:

- ``true``: This forces all text files in the working directory to have
  ``CRLF`` line-endings and converts line-endings to ``LF`` when writing to
  the index. When autocrlf is set to true, the eol value is ignored.
- ``input``: Quite similar to the ``true`` value, but only applies the clean
  filter, i.e. line-endings of new files added to the index will be converted
  to ``LF``.
- ``false`` (default): No normalization is done.

``core.eol`` is the top-level configuration to define the line-ending to use
when applying the smudge filter. It takes three possible values:

- ``lf``: When normalization is done, force line-endings to be ``LF`` in the
  working directory.
- ``crlf``: When normalization is done, force line-endings to be ``CRLF`` in
  the working directory.
- ``native`` (default): When normalization is done, force line-endings to be
  the platform's native line ending.

One thing to remember is that when line-ending normalization is done on a file,
Git always normalizes line-endings to ``LF`` when writing to the index.

There are sources that seem to indicate that Git won't do line-ending
normalization when a file contains mixed line-endings. I think this logic
might be in the text / binary detection heuristic, but I couldn't find it yet.
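For reference, the low-level conversion helpers defined in this module behave
roughly as follows (an illustrative sketch only; the byte strings are made up):

    from dulwich.line_ending import (
        convert_crlf_to_lf,
        get_clean_filter_autocrlf,
        get_smudge_filter_autocrlf,
    )

    assert convert_crlf_to_lf(b"hello\r\nworld\r\n") == b"hello\nworld\n"

    # autocrlf=input only installs a clean (checkin) filter ...
    assert get_clean_filter_autocrlf(b"input") is convert_crlf_to_lf
    # ... while autocrlf=false installs no filters at all.
    assert get_smudge_filter_autocrlf(b"false") is None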
Sources: - https://git-scm.com/docs/git-config#git-config-coreeol - https://git-scm.com/docs/git-config#git-config-coreautocrlf - https://git-scm.com/docs/gitattributes#_checking_out_and_checking_in - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/ """ __all__ = [ "CRLF", "LF", "BlobNormalizer", "LineEndingFilter", "TreeBlobNormalizer", "check_safecrlf", "convert_crlf_to_lf", "convert_lf_to_crlf", "get_clean_filter", "get_clean_filter_autocrlf", "get_smudge_filter", "get_smudge_filter_autocrlf", "normalize_blob", ] import logging from collections.abc import Callable, Mapping from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from .config import StackedConfig from .object_store import BaseObjectStore from .attrs import GitAttributes, Pattern from .filters import FilterBlobNormalizer, FilterContext, FilterDriver, FilterRegistry from .object_store import iter_tree_contents from .objects import Blob, ObjectID from .patch import is_binary CRLF = b"\r\n" LF = b"\n" logger = logging.getLogger(__name__) class LineEndingFilter(FilterDriver): """Filter driver for line ending conversion.""" def __init__( self, clean_conversion: Callable[[bytes], bytes] | None = None, smudge_conversion: Callable[[bytes], bytes] | None = None, binary_detection: bool = True, safecrlf: bytes = b"false", ): """Initialize LineEndingFilter.""" self.clean_conversion = clean_conversion self.smudge_conversion = smudge_conversion self.binary_detection = binary_detection self.safecrlf = safecrlf @classmethod def from_config( cls, config: "StackedConfig | None", for_text_attr: bool = False ) -> "LineEndingFilter": """Create a LineEndingFilter from git configuration. Args: config: Git configuration stack for_text_attr: If True, always normalize on checkin (for text attribute) Returns: Configured LineEndingFilter instance """ if config is None: # Default filter if for_text_attr: # For text attribute: always normalize on checkin return cls( clean_conversion=convert_crlf_to_lf, smudge_conversion=None, binary_detection=True, ) else: # No config: no conversion return cls() # Get core.eol setting try: core_eol_raw = config.get("core", "eol") core_eol: str = ( core_eol_raw.decode("ascii") if isinstance(core_eol_raw, bytes) else str(core_eol_raw) ) except KeyError: core_eol = "native" # Get core.autocrlf setting try: autocrlf_raw = config.get("core", "autocrlf") autocrlf: bytes = ( autocrlf_raw.lower() if isinstance(autocrlf_raw, bytes) else str(autocrlf_raw).lower().encode("ascii") ) except KeyError: autocrlf = b"false" # Get core.safecrlf setting try: safecrlf_raw = config.get("core", "safecrlf") safecrlf = ( safecrlf_raw if isinstance(safecrlf_raw, bytes) else safecrlf_raw.encode("utf-8") ) except KeyError: safecrlf = b"false" if for_text_attr: # For text attribute: always normalize to LF on checkin # Smudge behavior depends on core.eol and core.autocrlf smudge_filter = get_smudge_filter(core_eol, autocrlf) clean_filter: Callable[[bytes], bytes] | None = convert_crlf_to_lf else: # Normal autocrlf behavior smudge_filter = get_smudge_filter(core_eol, autocrlf) clean_filter = get_clean_filter(core_eol, autocrlf) return cls( clean_conversion=clean_filter, smudge_conversion=smudge_filter, binary_detection=True, safecrlf=safecrlf, ) def clean(self, data: bytes, path: bytes = b"") -> bytes: """Apply line ending conversion for checkin (working tree -> repository).""" if self.clean_conversion is None: return data # Skip binary files if detection is enabled if self.binary_detection and is_binary(data): return data 
converted = self.clean_conversion(data) # Check if conversion is safe if self.safecrlf != b"false": check_safecrlf(data, converted, self.safecrlf, path) return converted def smudge(self, data: bytes, path: bytes = b"") -> bytes: """Apply line ending conversion for checkout (repository -> working tree).""" if self.smudge_conversion is None: return data # Skip binary files if detection is enabled if self.binary_detection and is_binary(data): return data converted = self.smudge_conversion(data) # Check if conversion is safe if self.safecrlf != b"false": check_safecrlf(data, converted, self.safecrlf, path) return converted def cleanup(self) -> None: """Clean up any resources held by this filter driver.""" # LineEndingFilter doesn't hold any resources that need cleanup def reuse(self, config: "StackedConfig", filter_name: str) -> bool: """Check if this filter driver should be reused with the given configuration.""" # LineEndingFilter is lightweight and should always be recreated # to ensure it uses the latest configuration return False def convert_crlf_to_lf(text_hunk: bytes) -> bytes: """Convert CRLF in text hunk into LF. Args: text_hunk: A bytes string representing a text hunk Returns: The text hunk with the same type, with CRLF replaced into LF """ return text_hunk.replace(CRLF, LF) def convert_lf_to_crlf(text_hunk: bytes) -> bytes: """Convert LF in text hunk into CRLF. Args: text_hunk: A bytes string representing a text hunk Returns: The text hunk with the same type, with LF replaced into CRLF """ # Single-pass conversion: split on LF and join with CRLF # This avoids the double replacement issue parts = text_hunk.split(LF) # Remove any trailing CR to avoid CRCRLF cleaned_parts = [] for i, part in enumerate(parts): if i < len(parts) - 1 and part.endswith(b"\r"): cleaned_parts.append(part[:-1]) else: cleaned_parts.append(part) return CRLF.join(cleaned_parts) def check_safecrlf( original: bytes, converted: bytes, safecrlf: bytes, path: bytes = b"" ) -> None: """Check if CRLF conversion is safe according to core.safecrlf setting. 
Args: original: Original content before conversion converted: Content after conversion safecrlf: Value of core.safecrlf config (b"true", b"warn", or b"false") path: Path to the file being checked (for error messages) Raises: ValueError: If safecrlf is "true" and conversion would lose data """ if safecrlf == b"false": return # Check if conversion is reversible if safecrlf in (b"true", b"warn"): # For CRLF->LF conversion, check if converting back would recover original if CRLF in original and CRLF not in converted: # This was a CRLF->LF conversion recovered = convert_lf_to_crlf(converted) if recovered != original: msg = ( f"CRLF would be replaced by LF in {path.decode('utf-8', 'replace')}" ) if safecrlf == b"true": raise ValueError(msg) else: # warn logger.warning(msg) # For LF->CRLF conversion, check if converting back would recover original elif LF in original and CRLF in converted and CRLF not in original: # This was a LF->CRLF conversion recovered = convert_crlf_to_lf(converted) if recovered != original: msg = ( f"LF would be replaced by CRLF in {path.decode('utf-8', 'replace')}" ) if safecrlf == b"true": raise ValueError(msg) else: # warn logger.warning(msg) def get_smudge_filter( core_eol: str, core_autocrlf: bytes ) -> Callable[[bytes], bytes] | None: """Returns the correct smudge filter based on the passed arguments.""" # Git attributes handling is done by the filter infrastructure return get_smudge_filter_autocrlf(core_autocrlf) def get_clean_filter( core_eol: str, core_autocrlf: bytes ) -> Callable[[bytes], bytes] | None: """Returns the correct clean filter based on the passed arguments.""" # Git attributes handling is done by the filter infrastructure return get_clean_filter_autocrlf(core_autocrlf) def get_smudge_filter_autocrlf( core_autocrlf: bytes, ) -> Callable[[bytes], bytes] | None: """Returns the correct smudge filter base on autocrlf value. Args: core_autocrlf: The bytes configuration value of core.autocrlf. Valid values are: b'true', b'false' or b'input'. Returns: Either None if no filter has to be applied or a function accepting a single argument, a binary text hunk """ if core_autocrlf == b"true": return convert_lf_to_crlf return None def get_clean_filter_autocrlf( core_autocrlf: bytes, ) -> Callable[[bytes], bytes] | None: """Returns the correct clean filter base on autocrlf value. Args: core_autocrlf: The bytes configuration value of core.autocrlf. Valid values are: b'true', b'false' or b'input'. Returns: Either None if no filter has to be applied or a function accepting a single argument, a binary text hunk """ if core_autocrlf == b"true" or core_autocrlf == b"input": return convert_crlf_to_lf # Checking filter should never be `convert_lf_to_crlf` return None class BlobNormalizer(FilterBlobNormalizer): """An object to store computation result of which filter to apply based on configuration, gitattributes, path and operation (checkin or checkout). This class maintains backward compatibility while using the filter infrastructure. 
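    A minimal usage sketch (assumes an existing StackedConfig named
    config_stack, e.g. from Repo.get_config_stack(), and a Blob named blob):

        normalizer = BlobNormalizer(config_stack, {}, autocrlf=b"true")
        checked_in = normalizer.checkin_normalize(blob, b"docs/readme.txt")
        checked_out = normalizer.checkout_normalize(blob, b"docs/readme.txt")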
""" def __init__( self, config_stack: "StackedConfig", gitattributes: Mapping[str, Any], core_eol: str = "native", autocrlf: bytes = b"false", safecrlf: bytes = b"false", ) -> None: """Initialize FilteringBlobNormalizer.""" # Set up a filter registry with line ending filters filter_registry = FilterRegistry(config_stack) # Create line ending filter if needed smudge_filter = get_smudge_filter(core_eol, autocrlf) clean_filter = get_clean_filter(core_eol, autocrlf) # Always register a text filter that can be used by gitattributes # Even if autocrlf is false, gitattributes text=true should work line_ending_filter = LineEndingFilter( clean_conversion=clean_filter or convert_crlf_to_lf, smudge_conversion=smudge_filter or convert_lf_to_crlf, binary_detection=True, safecrlf=safecrlf, ) filter_registry.register_driver("text", line_ending_filter) # Convert dict gitattributes to GitAttributes object for parent class git_attrs_patterns = [] for pattern_str, attrs in gitattributes.items(): if isinstance(pattern_str, str): pattern_bytes = pattern_str.encode("utf-8") else: pattern_bytes = pattern_str pattern = Pattern(pattern_bytes) git_attrs_patterns.append((pattern, attrs)) git_attributes = GitAttributes(git_attrs_patterns) # Create FilterContext for parent class filter_context = FilterContext(filter_registry) # Initialize parent class with gitattributes # The filter infrastructure will handle gitattributes processing super().__init__(config_stack, git_attributes, filter_context=filter_context) # Store original filters for backward compatibility self.fallback_read_filter = smudge_filter self.fallback_write_filter = clean_filter def checkin_normalize(self, blob: Blob, tree_path: bytes) -> Blob: """Normalize a blob during a checkin operation.""" # First try to get filter from gitattributes (handled by parent) result = super().checkin_normalize(blob, tree_path) # Check if gitattributes explicitly disabled text conversion attrs = self.gitattributes.match_path(tree_path) if b"text" in attrs and attrs[b"text"] is False: # Explicitly marked as binary, no conversion return blob # If no filter was applied via gitattributes and we have a fallback filter # (autocrlf is enabled), apply it to all files if result is blob and self.fallback_write_filter is not None: # Apply the clean filter with binary detection # Get safecrlf from config safecrlf = b"false" if hasattr(self, "filter_registry") and hasattr( self.filter_registry, "config_stack" ): safecrlf = self.filter_registry.config_stack.get( b"core", b"safecrlf", b"false" ) if hasattr(safecrlf, "encode"): safecrlf = safecrlf.encode("utf-8") line_ending_filter = LineEndingFilter( clean_conversion=self.fallback_write_filter, smudge_conversion=None, binary_detection=True, safecrlf=safecrlf, ) filtered_data = line_ending_filter.clean(blob.data, tree_path) if filtered_data != blob.data: new_blob = Blob() new_blob.data = filtered_data return new_blob return result def checkout_normalize(self, blob: Blob, tree_path: bytes) -> Blob: """Normalize a blob during a checkout operation.""" # First try to get filter from gitattributes (handled by parent) result = super().checkout_normalize(blob, tree_path) # Check if gitattributes explicitly disabled text conversion attrs = self.gitattributes.match_path(tree_path) if b"text" in attrs and attrs[b"text"] is False: # Explicitly marked as binary, no conversion return blob # If no filter was applied via gitattributes and we have a fallback filter # (autocrlf is enabled), apply it to all files if result is blob and 
self.fallback_read_filter is not None: # Apply the smudge filter with binary detection # Get safecrlf from config safecrlf = b"false" if hasattr(self, "filter_registry") and hasattr( self.filter_registry, "config_stack" ): safecrlf = self.filter_registry.config_stack.get( b"core", b"safecrlf", b"false" ) if hasattr(safecrlf, "encode"): safecrlf = safecrlf.encode("utf-8") line_ending_filter = LineEndingFilter( clean_conversion=None, smudge_conversion=self.fallback_read_filter, binary_detection=True, safecrlf=safecrlf, ) filtered_data = line_ending_filter.smudge(blob.data, tree_path) if filtered_data != blob.data: new_blob = Blob() new_blob.data = filtered_data return new_blob return result def normalize_blob( blob: Blob, conversion: Callable[[bytes], bytes], binary_detection: bool ) -> Blob: """Normalize blob by applying line ending conversion.""" # Read the original blob data = blob.data # If we need to detect if a file is binary and the file is detected as # binary, do not apply the conversion function and return the original # chunked text if binary_detection is True: if is_binary(data): return blob # Now apply the conversion converted_data = conversion(data) new_blob = Blob() new_blob.data = converted_data return new_blob class TreeBlobNormalizer(BlobNormalizer): """Blob normalizer that tracks existing files in a tree.""" def __init__( self, config_stack: "StackedConfig", git_attributes: Mapping[str, Any], object_store: "BaseObjectStore", tree: ObjectID | None = None, core_eol: str = "native", autocrlf: bytes = b"false", safecrlf: bytes = b"false", ) -> None: """Initialize TreeBlobNormalizer.""" super().__init__(config_stack, git_attributes, core_eol, autocrlf, safecrlf) if tree: self.existing_paths = { name for name, _, _ in iter_tree_contents(object_store, tree) } else: self.existing_paths = set() def checkin_normalize(self, blob: Blob, tree_path: bytes) -> Blob: """Normalize blob for checkin, considering existing tree state.""" # Existing files should only be normalized on checkin if: # 1. They were previously normalized on checkout (autocrlf=true), OR # 2. We have a write filter (autocrlf=true or autocrlf=input), OR # 3. They are new files if ( self.fallback_read_filter is not None or self.fallback_write_filter is not None or tree_path not in self.existing_paths ): return super().checkin_normalize(blob, tree_path) return blob dulwich-1.0.0/dulwich/log_utils.py000066400000000000000000000135571513301442600171730ustar00rootroot00000000000000# log_utils.py -- Logging utilities for Dulwich # Copyright (C) 2010 Google, Inc. # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Logging utilities for Dulwich. Any module that uses logging needs to do compile-time initialization to set up the logging environment. 
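A consumer that does want Dulwich's log output (for example a command-line
tool) can opt in explicitly; a minimal sketch:

    from dulwich import log_utils

    log_utils.default_logging_config()               # honours GIT_TRACE, see below
    logger = log_utils.getLogger("dulwich.example")  # "dulwich.example" is made up
    logger.info("now visible on stderr")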
Since Dulwich is also used as a library, clients may not want to see any logging output. In that case, we need to use a special handler to suppress spurious warnings like "No handlers could be found for logger dulwich.foo". For details on the _NullHandler approach, see: http://docs.python.org/library/logging.html#configuring-logging-for-a-library For many modules, the only function from the logging module they need is getLogger; this module exports that function for convenience. If a calling module needs something else, it can import the standard logging module directly. """ __all__ = [ "default_logging_config", "getLogger", "remove_null_handler", ] import logging import os import sys getLogger = logging.getLogger class _NullHandler(logging.Handler): """No-op logging handler to avoid unexpected logging warnings.""" def emit(self, record: logging.LogRecord) -> None: pass _NULL_HANDLER = _NullHandler() _DULWICH_LOGGER = getLogger("dulwich") _DULWICH_LOGGER.addHandler(_NULL_HANDLER) def _should_trace() -> bool: """Check if GIT_TRACE is enabled. Returns True if tracing should be enabled, False otherwise. """ trace_value = os.environ.get("GIT_TRACE", "") if not trace_value or trace_value.lower() in ("0", "false"): return False return True def _get_trace_target() -> str | int | None: """Get the trace target from GIT_TRACE environment variable. Returns: - None if tracing is disabled - 2 for stderr output (values "1", "2", "true") - int (3-9) for file descriptor - str for file path (absolute paths or directories) """ trace_value = os.environ.get("GIT_TRACE", "") if not trace_value or trace_value.lower() in ("0", "false"): return None if trace_value.lower() in ("1", "2", "true"): return 2 # stderr # Check if it's a file descriptor (integer 3-9) try: fd = int(trace_value) if 3 <= fd <= 9: return fd except ValueError: pass # If it's an absolute path, return it as a string if os.path.isabs(trace_value): return trace_value # For any other value, treat it as disabled return None def _configure_logging_from_trace() -> bool: """Configure logging based on GIT_TRACE environment variable. Returns True if trace configuration was successful, False otherwise. """ trace_target = _get_trace_target() if trace_target is None: return False trace_format = "%(asctime)s %(name)s %(levelname)s: %(message)s" if trace_target == 2: # stderr logging.basicConfig(level=logging.DEBUG, stream=sys.stderr, format=trace_format) return True if isinstance(trace_target, int): # File descriptor try: stream = os.fdopen(trace_target, "w", buffering=1) logging.basicConfig(level=logging.DEBUG, stream=stream, format=trace_format) return True except OSError as e: sys.stderr.write( f"Warning: Failed to open GIT_TRACE fd {trace_target}: {e}\n" ) return False # File path try: if os.path.isdir(trace_target): # For directories, create a file per process filename = os.path.join(trace_target, f"trace.{os.getpid()}") else: filename = trace_target logging.basicConfig( level=logging.DEBUG, filename=filename, filemode="a", format=trace_format ) return True except OSError as e: sys.stderr.write( f"Warning: Failed to open GIT_TRACE file {trace_target}: {e}\n" ) return False def default_logging_config() -> None: """Set up the default Dulwich loggers. 
Respects the GIT_TRACE environment variable for trace output: - If GIT_TRACE is set to "1", "2", or "true", trace to stderr - If GIT_TRACE is set to an integer 3-9, trace to that file descriptor - If GIT_TRACE is set to an absolute path, trace to that file - If the path is a directory, trace to files in that directory (per process) - Otherwise, use default stderr output """ remove_null_handler() # Try to configure from GIT_TRACE, fall back to default if it fails if not _configure_logging_from_trace(): logging.basicConfig( level=logging.INFO, stream=sys.stderr, format="%(asctime)s %(levelname)s: %(message)s", ) def remove_null_handler() -> None: """Remove the null handler from the Dulwich loggers. If a caller wants to set up logging using something other than default_logging_config, calling this function first is a minor optimization to avoid the overhead of using the _NullHandler. """ _DULWICH_LOGGER.removeHandler(_NULL_HANDLER) dulwich-1.0.0/dulwich/lru_cache.py000066400000000000000000000377651513301442600171260ustar00rootroot00000000000000# lru_cache.py -- Simple LRU cache for dulwich # Copyright (C) 2006, 2008 Canonical Ltd # Copyright (C) 2022 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """A simple least-recently-used (LRU) cache.""" __all__ = [ "LRUCache", "LRUSizeCache", ] from collections.abc import Callable, Iterable, Iterator from typing import Generic, TypeVar, cast _null_key = object() K = TypeVar("K") V = TypeVar("V") class _LRUNode(Generic[K, V]): """This maintains the linked-list which is the lru internals.""" __slots__ = ("cleanup", "key", "next_key", "prev", "size", "value") prev: "_LRUNode[K, V] | None" next_key: K | object size: int | None def __init__( self, key: K, value: V, cleanup: Callable[[K, V], None] | None = None ) -> None: self.prev = None self.next_key = _null_key self.key = key self.value = value self.cleanup = cleanup # TODO: We could compute this 'on-the-fly' like we used to, and remove # one pointer from this object, we just need to decide if it # actually costs us much of anything in normal usage self.size = None def __repr__(self) -> str: if self.prev is None: prev_key = None else: prev_key = self.prev.key return f"{self.__class__.__name__}({self.key!r} n:{self.next_key!r} p:{prev_key!r})" def run_cleanup(self) -> None: if self.cleanup is not None: self.cleanup(self.key, self.value) self.cleanup = None # Just make sure to break any refcycles, etc del self.value class LRUCache(Generic[K, V]): """A class which manages a cache of entries, removing unused ones.""" _least_recently_used: _LRUNode[K, V] | None _most_recently_used: _LRUNode[K, V] | None def __init__( self, max_cache: int = 100, after_cleanup_count: int | None = None ) -> None: """Initialize LRUCache. 
Args: max_cache: Maximum number of entries to cache after_cleanup_count: Number of entries to keep after cleanup """ self._cache: dict[K, _LRUNode[K, V]] = {} # The "HEAD" of the lru linked list self._most_recently_used = None # The "TAIL" of the lru linked list self._least_recently_used = None self._update_max_cache(max_cache, after_cleanup_count) def __contains__(self, key: K) -> bool: """Check if key is in cache.""" return key in self._cache def __getitem__(self, key: K) -> V: """Get item from cache and mark as recently used.""" cache = self._cache node = cache[key] # Inlined from _record_access to decrease the overhead of __getitem__ # We also have more knowledge about structure if __getitem__ is # succeeding, then we know that self._most_recently_used must not be # None, etc. mru = self._most_recently_used if node is mru: # Nothing to do, this node is already at the head of the queue return node.value # Remove this node from the old location node_prev = node.prev next_key = node.next_key # benchmarking shows that the lookup of _null_key in globals is faster # than the attribute lookup for (node is self._least_recently_used) if next_key is _null_key: # 'node' is the _least_recently_used, because it doesn't have a # 'next' item. So move the current lru to the previous node. self._least_recently_used = node_prev else: node_next = cache[cast(K, next_key)] node_next.prev = node_prev assert node_prev assert mru node_prev.next_key = next_key # Insert this node at the front of the list node.next_key = mru.key mru.prev = node self._most_recently_used = node node.prev = None return node.value def __len__(self) -> int: """Return number of items in cache.""" return len(self._cache) def _walk_lru(self) -> Iterator[_LRUNode[K, V]]: """Walk the LRU list, only meant to be used in tests.""" node = self._most_recently_used if node is not None: if node.prev is not None: raise AssertionError( "the _most_recently_used entry is not" " supposed to have a previous entry" f" {node}" ) while node is not None: if node.next_key is _null_key: if node is not self._least_recently_used: raise AssertionError( f"only the last node should have no next value: {node}" ) node_next = None else: node_next = self._cache[cast(K, node.next_key)] if node_next.prev is not node: raise AssertionError( f"inconsistency found, node.next.prev != node: {node}" ) if node.prev is None: if node is not self._most_recently_used: raise AssertionError( "only the _most_recently_used should" f" not have a previous node: {node}" ) else: if node.prev.next_key != node.key: raise AssertionError( f"inconsistency found, node.prev.next != node: {node}" ) yield node node = node_next def add( self, key: K, value: V, cleanup: Callable[[K, V], None] | None = None ) -> None: """Add a new value to the cache. Also, if the entry is ever removed from the cache, call cleanup(key, value). Args: key: The key to store it under value: The object to store cleanup: None or a function taking (key, value) to indicate 'value' should be cleaned up. 
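        Example (an illustrative sketch; ``cache`` and ``obj`` are assumed to
        exist already, and the cleanup callback is hypothetical):

            def on_evict(key, value):
                # called whenever the entry is later removed from the cache
                print("evicting", key)

            cache.add(b"some-key", obj, cleanup=on_evict)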
""" if key is _null_key: raise ValueError("cannot use _null_key as a key") if key in self._cache: node = self._cache[key] node.run_cleanup() node.value = value node.cleanup = cleanup else: node = _LRUNode(key, value, cleanup=cleanup) self._cache[key] = node self._record_access(node) if len(self._cache) > self._max_cache: # Trigger the cleanup self.cleanup() def cache_size(self) -> int: """Get the number of entries we will cache.""" return self._max_cache def get(self, key: K, default: V | None = None) -> V | None: """Get value from cache with default if not found. Args: key: Key to look up default: Default value if key not found Returns: Value from cache or default """ node = self._cache.get(key, None) if node is None: return default self._record_access(node) return node.value def keys(self) -> Iterable[K]: """Get the list of keys currently cached. Note that values returned here may not be available by the time you request them later. This is simply meant as a peak into the current state. Returns: An unordered list of keys that are currently cached. """ return self._cache.keys() def items(self) -> dict[K, V]: """Get the key:value pairs as a dict.""" return {k: n.value for k, n in self._cache.items()} def cleanup(self) -> None: """Clear the cache until it shrinks to the requested size. This does not completely wipe the cache, just makes sure it is under the after_cleanup_count. """ # Make sure the cache is shrunk to the correct size while len(self._cache) > self._after_cleanup_count: self._remove_lru() def __setitem__(self, key: K, value: V) -> None: """Add a value to the cache, there will be no cleanup function.""" self.add(key, value, cleanup=None) def _record_access(self, node: _LRUNode[K, V]) -> None: """Record that key was accessed.""" # Move 'node' to the front of the queue if self._most_recently_used is None: self._most_recently_used = node self._least_recently_used = node return elif node is self._most_recently_used: # Nothing to do, this node is already at the head of the queue return # We've taken care of the tail pointer, remove the node, and insert it # at the front # REMOVE if node is self._least_recently_used: self._least_recently_used = node.prev if node.prev is not None: node.prev.next_key = node.next_key if node.next_key is not _null_key: node_next = self._cache[cast(K, node.next_key)] node_next.prev = node.prev # INSERT node.next_key = self._most_recently_used.key self._most_recently_used.prev = node self._most_recently_used = node node.prev = None def _remove_node(self, node: _LRUNode[K, V]) -> None: if node is self._least_recently_used: self._least_recently_used = node.prev self._cache.pop(node.key) # If we have removed all entries, remove the head pointer as well if self._least_recently_used is None: self._most_recently_used = None node.run_cleanup() # Now remove this node from the linked list if node.prev is not None: node.prev.next_key = node.next_key if node.next_key is not _null_key: node_next = self._cache[cast(K, node.next_key)] node_next.prev = node.prev # And remove this node's pointers node.prev = None node.next_key = _null_key def _remove_lru(self) -> None: """Remove one entry from the lru, and handle consequences. If there are no more references to the lru, then this entry should be removed from the cache. 
""" assert self._least_recently_used self._remove_node(self._least_recently_used) def clear(self) -> None: """Clear out all of the cache.""" # Clean up in LRU order while self._cache: self._remove_lru() def resize(self, max_cache: int, after_cleanup_count: int | None = None) -> None: """Change the number of entries that will be cached.""" self._update_max_cache(max_cache, after_cleanup_count=after_cleanup_count) def _update_max_cache( self, max_cache: int, after_cleanup_count: int | None = None ) -> None: self._max_cache = max_cache if after_cleanup_count is None: self._after_cleanup_count = self._max_cache * 8 / 10 else: self._after_cleanup_count = min(after_cleanup_count, self._max_cache) self.cleanup() class LRUSizeCache(LRUCache[K, V]): """An LRUCache that removes things based on the size of the values. This differs in that it doesn't care how many actual items there are, it just restricts the cache to be cleaned up after so much data is stored. The size of items added will be computed using compute_size(value), which defaults to len() if not supplied. """ _compute_size: Callable[[V], int] def __init__( self, max_size: int = 1024 * 1024, after_cleanup_size: int | None = None, compute_size: Callable[[V], int] | None = None, ) -> None: """Create a new LRUSizeCache. Args: max_size: The max number of bytes to store before we start clearing out entries. after_cleanup_size: After cleaning up, shrink everything to this size. compute_size: A function to compute the size of the values. We use a function here, so that you can pass 'len' if you are just using simple strings, or a more complex function if you are using something like a list of strings, or even a custom object. The function should take the form "compute_size(value) => integer". If not supplied, it defaults to 'len()' """ self._value_size = 0 if compute_size is None: self._compute_size = cast(Callable[[V], int], len) else: self._compute_size = compute_size self._update_max_size(max_size, after_cleanup_size=after_cleanup_size) LRUCache.__init__(self, max_cache=max(int(max_size / 512), 1)) def add( self, key: K, value: V, cleanup: Callable[[K, V], None] | None = None ) -> None: """Add a new value to the cache. Also, if the entry is ever removed from the cache, call cleanup(key, value). Args: key: The key to store it under value: The object to store cleanup: None or a function taking (key, value) to indicate 'value' should be cleaned up. """ if key is _null_key: raise ValueError("cannot use _null_key as a key") node = self._cache.get(key, None) value_len = self._compute_size(value) if value_len >= self._after_cleanup_size: # The new value is 'too big to fit', as it would fill up/overflow # the cache all by itself if node is not None: # We won't be replacing the old node, so just remove it self._remove_node(node) if cleanup is not None: cleanup(key, value) return if node is None: node = _LRUNode(key, value, cleanup=cleanup) self._cache[key] = node else: assert node.size is not None self._value_size -= node.size node.size = value_len self._value_size += value_len self._record_access(node) if self._value_size > self._max_size: # Time to cleanup self.cleanup() def cleanup(self) -> None: """Clear the cache until it shrinks to the requested size. This does not completely wipe the cache, just makes sure it is under the after_cleanup_size. 
""" # Make sure the cache is shrunk to the correct size while self._value_size > self._after_cleanup_size: self._remove_lru() def _remove_node(self, node: _LRUNode[K, V]) -> None: assert node.size is not None self._value_size -= node.size LRUCache._remove_node(self, node) def resize(self, max_size: int, after_cleanup_size: int | None = None) -> None: """Change the number of bytes that will be cached.""" self._update_max_size(max_size, after_cleanup_size=after_cleanup_size) max_cache = max(int(max_size / 512), 1) self._update_max_cache(max_cache) def _update_max_size( self, max_size: int, after_cleanup_size: int | None = None ) -> None: self._max_size = max_size if after_cleanup_size is None: self._after_cleanup_size = self._max_size * 8 // 10 else: self._after_cleanup_size = min(after_cleanup_size, self._max_size) dulwich-1.0.0/dulwich/mailmap.py000066400000000000000000000127651513301442600166120ustar00rootroot00000000000000# mailmap.py -- Mailmap reader # Copyright (C) 2018 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Mailmap file reader.""" __all__ = [ "Mailmap", "parse_identity", "read_mailmap", ] from collections.abc import Iterator from typing import IO def parse_identity(text: bytes) -> tuple[bytes | None, bytes | None]: """Parse an identity string into name and email. Args: text: Identity string in format "Name " Returns: Tuple of (name, email) where either can be None """ # TODO(jelmer): Integrate this with dulwich.fastexport.split_email and # dulwich.repo.check_user_identity (name_str, email_str) = text.rsplit(b"<", 1) name_str = name_str.strip() email_str = email_str.rstrip(b">").strip() name: bytes | None = name_str if name_str else None email: bytes | None = email_str if email_str else None return (name, email) def read_mailmap( f: IO[bytes], ) -> Iterator[ tuple[ tuple[bytes | None, bytes | None], tuple[bytes | None, bytes | None] | None, ] ]: """Read a mailmap. Args: f: File-like object to read from Returns: Iterator over ((canonical_name, canonical_email), (from_name, from_email)) tuples """ for line in f: # Remove comments line = line.split(b"#")[0] line = line.strip() if not line: continue (canonical_identity, from_identity) = line.split(b">", 1) canonical_identity += b">" if from_identity.strip(): parsed_from_identity = parse_identity(from_identity) else: parsed_from_identity = None parsed_canonical_identity = parse_identity(canonical_identity) yield parsed_canonical_identity, parsed_from_identity class Mailmap: """Class for accessing a mailmap file.""" def __init__( self, map: Iterator[ tuple[ tuple[bytes | None, bytes | None], tuple[bytes | None, bytes | None] | None, ] ] | None = None, ) -> None: """Initialize Mailmap. 
Args: map: Optional iterator of (canonical_identity, from_identity) tuples """ self._table: dict[ tuple[bytes | None, bytes | None], tuple[bytes | None, bytes | None], ] = {} if map: for canonical_identity, from_identity in map: self.add_entry(canonical_identity, from_identity) def add_entry( self, canonical_identity: tuple[bytes | None, bytes | None], from_identity: tuple[bytes | None, bytes | None] | None = None, ) -> None: """Add an entry to the mail mail. Any of the fields can be None, but at least one of them needs to be set. Args: canonical_identity: The canonical identity (tuple) from_identity: The from identity (tuple) """ if from_identity is None: from_name, from_email = None, None else: (from_name, from_email) = from_identity (canonical_name, canonical_email) = canonical_identity if from_name is None and from_email is None: self._table[canonical_name, None] = canonical_identity self._table[None, canonical_email] = canonical_identity else: self._table[from_name, from_email] = canonical_identity def lookup( self, identity: bytes | tuple[bytes | None, bytes | None] ) -> bytes | tuple[bytes | None, bytes | None]: """Lookup an identity in this mailmail.""" if not isinstance(identity, tuple): was_tuple = False identity = parse_identity(identity) else: was_tuple = True for query in [identity, (None, identity[1]), (identity[0], None)]: canonical_identity = self._table.get(query) if canonical_identity is not None: identity = ( canonical_identity[0] or identity[0], canonical_identity[1] or identity[1], ) break if was_tuple: return identity else: name, email = identity if name is None: name = b"" if email is None: email = b"" return name + b" <" + email + b">" @classmethod def from_path(cls, path: str) -> "Mailmap": """Create Mailmap from file path. Args: path: Path to mailmap file Returns: Mailmap instance """ with open(path, "rb") as f: return cls(read_mailmap(f)) dulwich-1.0.0/dulwich/maintenance.py000066400000000000000000000366121513301442600174510ustar00rootroot00000000000000# maintenance.py -- Git maintenance implementation # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git maintenance implementation. This module provides the git maintenance functionality for optimizing and maintaining Git repositories. 
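Example (an illustrative sketch; ``repo`` is assumed to be an already-open
dulwich ``Repo``):

    from dulwich.maintenance import run_maintenance

    result = run_maintenance(repo, tasks=["gc", "pack-refs"])
    for name in result.tasks_failed:
        print(name, result.errors.get(name))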
""" __all__ = [ "CommitGraphTask", "GcTask", "IncrementalRepackTask", "LooseObjectsTask", "MaintenanceResult", "MaintenanceSchedule", "MaintenanceTask", "PackRefsTask", "PrefetchTask", "get_enabled_tasks", "register_repository", "run_maintenance", "unregister_repository", ] import logging import os from abc import ABC, abstractmethod from collections.abc import Callable from dataclasses import dataclass, field from enum import Enum from typing import TYPE_CHECKING if TYPE_CHECKING: from .repo import BaseRepo, Repo logger = logging.getLogger(__name__) class MaintenanceSchedule(str, Enum): """Maintenance schedule types.""" HOURLY = "hourly" DAILY = "daily" WEEKLY = "weekly" @dataclass class MaintenanceResult: """Result from running maintenance tasks.""" tasks_run: list[str] = field(default_factory=list) tasks_succeeded: list[str] = field(default_factory=list) tasks_failed: list[str] = field(default_factory=list) errors: dict[str, str] = field(default_factory=dict) class MaintenanceTask(ABC): """Base class for maintenance tasks.""" name: str = "" def __init__( self, repo: "BaseRepo", auto: bool = False, progress: Callable[[str], None] | None = None, ) -> None: """Initialize maintenance task. Args: repo: Repository object auto: If True, only run if needed progress: Optional progress callback """ self.repo = repo self.auto = auto self.progress = progress @abstractmethod def run(self) -> bool: """Run the maintenance task. Returns: True if successful, False otherwise """ def is_enabled(self) -> bool: """Check if task is enabled in repository configuration. Returns: True if task is enabled """ if not self.name: return False config = self.repo.get_config() try: enabled = config.get_boolean( (b"maintenance", self.name.encode()), b"enabled" ) return enabled if enabled is not None else self.default_enabled() except KeyError: # Return default enabled state return self.default_enabled() def default_enabled(self) -> bool: """Return default enabled state for this task. Returns: True if task should be enabled by default """ return False class GcTask(MaintenanceTask): """Garbage collection maintenance task.""" name = "gc" def default_enabled(self) -> bool: """GC is enabled by default.""" return True def run(self) -> bool: """Run garbage collection. Returns: True if successful, False otherwise """ from .gc import garbage_collect from .repo import Repo if self.progress: self.progress("Running gc task") assert isinstance(self.repo, Repo) garbage_collect(self.repo, auto=self.auto, progress=self.progress) return True class CommitGraphTask(MaintenanceTask): """Commit-graph maintenance task.""" name = "commit-graph" def default_enabled(self) -> bool: """Commit-graph is enabled by default.""" return True def run(self) -> bool: """Update commit-graph file. Returns: True if successful, False otherwise """ if self.progress: self.progress("Running commit-graph task") # Get all refs refs = list(self.repo.refs.as_dict().values()) if refs: self.repo.object_store.write_commit_graph(refs, reachable=True) return True class LooseObjectsTask(MaintenanceTask): """Loose-objects maintenance task. This packs loose objects that are not already packed. """ name = "loose-objects" def run(self) -> bool: """Pack loose objects. 
Returns: True if successful, False otherwise """ from .object_store import PackBasedObjectStore if self.progress: self.progress("Running loose-objects task") # Pack loose objects using the object store's method assert isinstance(self.repo.object_store, PackBasedObjectStore) count = self.repo.object_store.pack_loose_objects(progress=self.progress) if self.progress and count > 0: self.progress(f"Packed {count} loose objects") return True class IncrementalRepackTask(MaintenanceTask): """Incremental-repack maintenance task. This consolidates pack files incrementally. """ name = "incremental-repack" def run(self) -> bool: """Consolidate pack files incrementally. Returns: True if successful, False otherwise """ from .object_store import PackBasedObjectStore if self.progress: self.progress("Running incremental-repack task") # Get all packs sorted by size assert isinstance(self.repo.object_store, PackBasedObjectStore) packs = self.repo.object_store.packs if len(packs) <= 1: # Nothing to consolidate if self.progress: self.progress("No packs to consolidate") return True # In auto mode, only repack if there are many small packs # This is a heuristic similar to git's auto gc behavior if self.auto: # Only repack if we have more than 50 packs # (matching git's gc.autoPackLimit default) if len(packs) < 50: if self.progress: self.progress( f"Skipping incremental repack: only {len(packs)} packs" ) return True # Perform a full repack to consolidate all packs if self.progress: self.progress(f"Consolidating {len(packs)} pack files") count = self.repo.object_store.repack(progress=self.progress) if self.progress: self.progress(f"Repacked {count} objects") return True class PackRefsTask(MaintenanceTask): """Pack-refs maintenance task.""" name = "pack-refs" def run(self) -> bool: """Pack loose references. Returns: True if successful, False otherwise """ if self.progress: self.progress("Running pack-refs task") self.repo.refs.pack_refs(all=True) return True class PrefetchTask(MaintenanceTask): """Prefetch maintenance task. This prefetches remote refs to keep the object database up-to-date. """ name = "prefetch" def run(self) -> bool: """Prefetch remote refs. Returns: True if successful, False otherwise """ from .porcelain import fetch from .repo import Repo if self.progress: self.progress("Running prefetch task") config = self.repo.get_config() # Get all configured remotes remotes = set() for section in config.sections(): if len(section) == 2 and section[0] == b"remote": remotes.add(section[1].decode()) if not remotes: if self.progress: self.progress("No remotes configured, skipping prefetch") return True # Fetch from each remote success = True for remote_name in sorted(remotes): try: if self.progress: self.progress(f"Fetching from {remote_name}") # Fetch quietly without updating working tree # The fetch operation will update refs under refs/remotes/ assert isinstance(self.repo, Repo) fetch( self.repo, remote_location=remote_name, quiet=True, ) except Exception as e: # Log error and mark as failed logger.error(f"Failed to fetch from {remote_name}: {e}") success = False return success # Registry of available maintenance tasks MAINTENANCE_TASKS: dict[str, type[MaintenanceTask]] = { "gc": GcTask, "commit-graph": CommitGraphTask, "loose-objects": LooseObjectsTask, "incremental-repack": IncrementalRepackTask, "pack-refs": PackRefsTask, "prefetch": PrefetchTask, } def get_enabled_tasks( repo: "BaseRepo", task_filter: list[str] | None = None, ) -> list[str]: """Get list of enabled maintenance tasks. 
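    Example (illustrative; ``repo`` is assumed, and the exact result depends
    on the repository configuration):

        get_enabled_tasks(repo)
        # -> e.g. ["gc", "commit-graph"], the tasks enabled by default
        get_enabled_tasks(repo, ["gc", "prefetch", "no-such-task"])
        # -> ["gc", "prefetch"]  (unknown task names are dropped)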
Args: repo: Repository object task_filter: Optional list of specific task names to run Returns: List of enabled task names """ if task_filter: # Validate requested tasks exist return [name for name in task_filter if name in MAINTENANCE_TASKS] enabled_tasks = [] # Check each task to see if it's enabled for task_name, task_class in MAINTENANCE_TASKS.items(): # Create temporary task instance to check if enabled task = task_class(repo, auto=False, progress=None) if task.is_enabled(): enabled_tasks.append(task_name) return enabled_tasks def run_maintenance( repo: "BaseRepo", tasks: list[str] | None = None, auto: bool = False, progress: Callable[[str], None] | None = None, ) -> MaintenanceResult: """Run maintenance tasks on a repository. Args: repo: Repository object tasks: Optional list of specific task names to run auto: If True, only run tasks if needed progress: Optional progress callback Returns: MaintenanceResult with task execution results """ result = MaintenanceResult() enabled_tasks = get_enabled_tasks(repo, tasks) for task_name in enabled_tasks: result.tasks_run.append(task_name) task_class = MAINTENANCE_TASKS.get(task_name) if not task_class: result.tasks_failed.append(task_name) result.errors[task_name] = "Unknown task" continue try: task = task_class(repo, auto=auto, progress=progress) success = task.run() if success: result.tasks_succeeded.append(task_name) else: result.tasks_failed.append(task_name) except Exception as e: result.tasks_failed.append(task_name) result.errors[task_name] = str(e) logger.error(f"Task {task_name} failed: {e}") return result def register_repository(repo: "Repo") -> None: """Register a repository for background maintenance. This adds the repository to the global maintenance.repo config and sets up recommended configuration for scheduled maintenance. Args: repo: Repository to register """ from .config import ConfigFile repo_path = os.path.abspath(repo.path) # Get global config path global_config_path = os.path.expanduser("~/.gitconfig") try: global_config = ConfigFile.from_path(global_config_path) except FileNotFoundError: # Create new config file if it doesn't exist global_config = ConfigFile() global_config.path = global_config_path # Add repository to maintenance.repo list # Check if already registered repo_path_bytes = repo_path.encode() try: existing_repos = list(global_config.get_multivar((b"maintenance",), b"repo")) except KeyError: existing_repos = [] if repo_path_bytes in existing_repos: # Already registered return # Add to global config global_config.set((b"maintenance",), b"repo", repo_path_bytes) # Set up incremental strategy in global config if not already set try: global_config.get((b"maintenance",), b"strategy") except KeyError: global_config.set((b"maintenance",), b"strategy", b"incremental") # Configure task schedules for incremental strategy schedule_config = { b"commit-graph": b"hourly", b"prefetch": b"hourly", b"loose-objects": b"daily", b"incremental-repack": b"daily", } for task, schedule in schedule_config.items(): try: global_config.get((b"maintenance", task), b"schedule") except KeyError: global_config.set((b"maintenance", task), b"schedule", schedule) global_config.write_to_path() # Disable foreground auto maintenance in the repository repo_config = repo.get_config() repo_config.set((b"maintenance",), b"auto", False) repo_config.write_to_path() def unregister_repository(repo: "Repo", force: bool = False) -> None: """Unregister a repository from background maintenance. 
This removes the repository from the global maintenance.repo config. Args: repo: Repository to unregister force: If True, don't error if repository is not registered Raises: ValueError: If repository is not registered and force is False """ from .config import ConfigFile repo_path = os.path.abspath(repo.path) # Get global config global_config_path = os.path.expanduser("~/.gitconfig") try: global_config = ConfigFile.from_path(global_config_path) except FileNotFoundError: if not force: raise ValueError( f"Repository {repo_path} is not registered for maintenance" ) return # Check if repository is registered repo_path_bytes = repo_path.encode() try: existing_repos = list(global_config.get_multivar((b"maintenance",), b"repo")) except KeyError: if not force: raise ValueError( f"Repository {repo_path} is not registered for maintenance" ) return if repo_path_bytes not in existing_repos: if not force: raise ValueError( f"Repository {repo_path} is not registered for maintenance" ) return # Remove from list existing_repos.remove(repo_path_bytes) # Delete the maintenance section and recreate it with remaining repos try: del global_config[(b"maintenance",)] except KeyError: pass # Re-add remaining repos for remaining_repo in existing_repos: global_config.set((b"maintenance",), b"repo", remaining_repo) global_config.write_to_path() dulwich-1.0.0/dulwich/mbox.py000066400000000000000000000242571513301442600161360ustar00rootroot00000000000000# mbox.py -- For dealing with mbox files # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Classes for dealing with mbox files and Maildir. This module provides functionality to split mbox files and Maildir into individual message files, similar to git mailsplit, and to extract patch information from email messages, similar to git mailinfo. """ __all__ = [ "mailinfo", "split_maildir", "split_mbox", ] import mailbox import os from collections.abc import Iterable, Iterator from pathlib import Path from typing import TYPE_CHECKING, BinaryIO, TextIO if TYPE_CHECKING: from .patch import MailinfoResult def split_mbox( input_file: str | bytes | BinaryIO, output_dir: str | bytes | Path, start_number: int = 1, precision: int = 4, keep_cr: bool = False, mboxrd: bool = False, ) -> list[str]: r"""Split an mbox file into individual message files. Args: input_file: Path to mbox file or file-like object. If None, reads from stdin. 
output_dir: Directory where individual messages will be written start_number: Starting number for output files (default: 1) precision: Number of digits for output filenames (default: 4) keep_cr: If True, preserve \r in lines ending with \r\n (default: False) mboxrd: If True, treat input as mboxrd format and reverse escaping (default: False) Returns: List of output file paths that were created Raises: ValueError: If output_dir doesn't exist or isn't a directory OSError: If there are issues reading/writing files """ # Convert output_dir to Path for easier manipulation if isinstance(output_dir, bytes): output_dir = output_dir.decode("utf-8") output_path = Path(output_dir) if not output_path.exists(): raise ValueError(f"Output directory does not exist: {output_dir}") if not output_path.is_dir(): raise ValueError(f"Output path is not a directory: {output_dir}") # Open the mbox file mbox_obj: mailbox.mbox | None = None mbox_iter: Iterable[mailbox.mboxMessage] if isinstance(input_file, (str, bytes)): if isinstance(input_file, bytes): input_file = input_file.decode("utf-8") mbox_obj = mailbox.mbox(input_file) mbox_iter = mbox_obj else: # For file-like objects, we need to read and parse manually mbox_iter = _parse_mbox_from_file(input_file) try: output_files = [] msg_number = start_number for message in mbox_iter: # Format the output filename with the specified precision output_filename = f"{msg_number:0{precision}d}" output_file_path = output_path / output_filename # Write the message to the output file with open(output_file_path, "wb") as f: message_bytes = bytes(message) # Handle mboxrd format - reverse the escaping if mboxrd: message_bytes = _reverse_mboxrd_escaping(message_bytes) # Handle CR/LF if needed if not keep_cr: message_bytes = message_bytes.replace(b"\r\n", b"\n") # Strip trailing newlines (mailbox module adds separator newlines) message_bytes = message_bytes.rstrip(b"\n") if message_bytes: message_bytes += b"\n" f.write(message_bytes) output_files.append(str(output_file_path)) msg_number += 1 return output_files finally: if mbox_obj is not None: mbox_obj.close() def split_maildir( maildir_path: str | bytes | Path, output_dir: str | bytes | Path, start_number: int = 1, precision: int = 4, keep_cr: bool = False, ) -> list[str]: r"""Split a Maildir into individual message files. Maildir splitting relies upon filenames being sorted to output patches in the correct order. 
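    Example (illustrative paths; the output directory must already exist):

        split_maildir("/path/to/Maildir", "/tmp/patches", precision=4)
        # -> ["/tmp/patches/0001", "/tmp/patches/0002", ...]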
Args: maildir_path: Path to the Maildir directory (should contain cur, tmp, new subdirectories) output_dir: Directory where individual messages will be written start_number: Starting number for output files (default: 1) precision: Number of digits for output filenames (default: 4) keep_cr: If True, preserve \r in lines ending with \r\n (default: False) Returns: List of output file paths that were created Raises: ValueError: If maildir_path or output_dir don't exist or aren't valid OSError: If there are issues reading/writing files """ # Convert paths to Path objects if isinstance(maildir_path, bytes): maildir_path = maildir_path.decode("utf-8") if isinstance(output_dir, bytes): output_dir = output_dir.decode("utf-8") maildir = Path(maildir_path) output_path = Path(output_dir) if not maildir.exists(): raise ValueError(f"Maildir does not exist: {maildir_path}") if not maildir.is_dir(): raise ValueError(f"Maildir path is not a directory: {maildir_path}") if not output_path.exists(): raise ValueError(f"Output directory does not exist: {output_dir}") if not output_path.is_dir(): raise ValueError(f"Output path is not a directory: {output_dir}") # Open the Maildir md = mailbox.Maildir(str(maildir), factory=None) try: # Get all messages and sort by their keys to ensure consistent ordering sorted_keys = sorted(md.keys()) output_files = [] msg_number = start_number for key in sorted_keys: message = md[key] # Format the output filename with the specified precision output_filename = f"{msg_number:0{precision}d}" output_file_path = output_path / output_filename # Write the message to the output file with open(output_file_path, "wb") as f: message_bytes = bytes(message) # Handle CR/LF if needed if not keep_cr: message_bytes = message_bytes.replace(b"\r\n", b"\n") f.write(message_bytes) output_files.append(str(output_file_path)) msg_number += 1 return output_files finally: md.close() def _parse_mbox_from_file(file_obj: BinaryIO) -> Iterator[mailbox.mboxMessage]: """Parse mbox format from a file-like object. Args: file_obj: Binary file-like object containing mbox data Yields: Individual mboxMessage objects """ import tempfile # Create a temporary file to hold the mbox data with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp: tmp.write(file_obj.read()) tmp_path = tmp.name mbox = mailbox.mbox(tmp_path) try: yield from mbox finally: mbox.close() os.unlink(tmp_path) def _reverse_mboxrd_escaping(message_bytes: bytes) -> bytes: """Reverse mboxrd escaping (^>+From lines). In mboxrd format, lines matching ^>+From have one leading ">" removed. Args: message_bytes: Message content with mboxrd escaping Returns: Message content with escaping reversed """ lines = message_bytes.split(b"\n") result_lines = [] for line in lines: # Check if line matches the pattern ^>+From (one or more > followed by From) if line.startswith(b">") and line.lstrip(b">").startswith(b"From "): # Remove one leading ">" result_lines.append(line[1:]) else: result_lines.append(line) return b"\n".join(result_lines) def mailinfo( input_file: str | bytes | BinaryIO | TextIO, keep_subject: bool = False, keep_non_patch: bool = False, encoding: str | None = None, scissors: bool = False, message_id: bool = False, ) -> "MailinfoResult": """Extract patch information from an email message. High-level wrapper around patch.mailinfo() that handles file I/O. 
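    Example (illustrative; the filename is made up):

        info = mailinfo("0001-fix-typo.patch", scissors=True)
        # ``info`` is the MailinfoResult produced by patch.mailinfo()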
Args: input_file: Path to email file or file-like object (binary or text) keep_subject: If True, keep subject intact without munging (-k) keep_non_patch: If True, only strip [PATCH] from brackets (-b) encoding: Character encoding to use (default: detect from message) scissors: If True, remove everything before scissors line message_id: If True, include Message-ID in commit message (-m) Returns: MailinfoResult with parsed information (from patch.mailinfo) Raises: ValueError: If message is malformed or missing required fields OSError: If there are issues reading the file """ from .patch import mailinfo as patch_mailinfo # Handle file path input if isinstance(input_file, (str, bytes)): if isinstance(input_file, bytes): input_file = input_file.decode("utf-8") with open(input_file, "rb") as f: return patch_mailinfo( f, keep_subject=keep_subject, keep_non_patch=keep_non_patch, encoding=encoding, scissors=scissors, message_id=message_id, ) # Handle file-like objects return patch_mailinfo( input_file, keep_subject=keep_subject, keep_non_patch=keep_non_patch, encoding=encoding, scissors=scissors, message_id=message_id, ) dulwich-1.0.0/dulwich/merge.py000066400000000000000000000700621513301442600162630ustar00rootroot00000000000000# merge.py -- Git merge implementation # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git merge implementation.""" __all__ = [ "MergeConflict", "Merger", "make_merge3", "merge_blobs", "octopus_merge", "recursive_merge", "three_way_merge", ] from collections.abc import Sequence from typing import TYPE_CHECKING if TYPE_CHECKING: import merge3 from merge3 import SequenceMatcherProtocol else: try: import merge3 except ImportError: merge3 = None # type: ignore[assignment] from dulwich.attrs import GitAttributes from dulwich.config import Config from dulwich.merge_drivers import get_merge_driver_registry from dulwich.object_store import BaseObjectStore from dulwich.objects import S_ISGITLINK, Blob, Commit, ObjectID, Tree, is_blob, is_tree def make_merge3( base: Sequence[bytes], a: Sequence[bytes], b: Sequence[bytes], is_cherrypick: bool = False, sequence_matcher: "type[SequenceMatcherProtocol[bytes]] | None" = None, ) -> "merge3.Merge3[bytes]": """Return a Merge3 object, or raise ImportError if merge3 is not installed.""" if merge3 is None: raise ImportError( "merge3 module is required for three-way merging. " "Install it with: pip install merge3" ) return merge3.Merge3(base, a, b, is_cherrypick, sequence_matcher) class MergeConflict(Exception): """Raised when a merge conflict occurs.""" def __init__(self, path: bytes, message: str) -> None: """Initialize MergeConflict. 
Args: path: Path to the conflicted file message: Conflict description """ self.path = path super().__init__(f"Merge conflict in {path!r}: {message}") def _can_merge_lines( base_lines: Sequence[bytes], a_lines: Sequence[bytes], b_lines: Sequence[bytes] ) -> bool: """Check if lines can be merged without conflict.""" # If one side is unchanged, we can take the other side if base_lines == a_lines: return True elif base_lines == b_lines: return True else: # For now, treat any difference as a conflict # A more sophisticated algorithm would check for non-overlapping changes return False if merge3 is not None: def _merge3_to_bytes(m: "merge3.Merge3[bytes]") -> bytes: """Convert merge3 result to bytes with conflict markers. Args: m: Merge3 object Returns: Merged content as bytes """ result: list[bytes] = [] for group in m.merge_groups(): # type: ignore[no-untyped-call,unused-ignore] if group[0] == "unchanged": result.extend(group[1]) elif group[0] == "a": result.extend(group[1]) elif group[0] == "b": result.extend(group[1]) elif group[0] == "same": result.extend(group[1]) elif group[0] == "conflict": # Check if this is a real conflict or just different changes base_lines, a_lines, b_lines = group[1], group[2], group[3] # Try to merge line by line if _can_merge_lines(base_lines, a_lines, b_lines): merged_lines = _merge_lines(base_lines, a_lines, b_lines) result.extend(merged_lines) else: # Real conflict - add conflict markers result.append(b"<<<<<<< ours\n") result.extend(a_lines) result.append(b"=======\n") result.extend(b_lines) result.append(b">>>>>>> theirs\n") return b"".join(result) def _merge_lines( base_lines: Sequence[bytes], a_lines: Sequence[bytes], b_lines: Sequence[bytes] ) -> Sequence[bytes]: """Merge lines when possible.""" if base_lines == a_lines: return b_lines elif base_lines == b_lines: return a_lines else: # This shouldn't happen if _can_merge_lines returned True return a_lines def merge_blobs( base_blob: Blob | None, ours_blob: Blob | None, theirs_blob: Blob | None, path: bytes | None = None, gitattributes: GitAttributes | None = None, config: Config | None = None, ) -> tuple[bytes, bool]: """Perform three-way merge on blob contents. 
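    Example (a minimal sketch using in-memory blobs):

        base = Blob.from_string(b"original line")
        ours = Blob.from_string(b"changed line")
        theirs = Blob.from_string(b"original line")
        merged, had_conflicts = merge_blobs(base, ours, theirs)
        # Only "ours" changed relative to base, so merged == b"changed line"
        # and had_conflicts is False.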
Args: base_blob: Common ancestor blob (can be None) ours_blob: Our version of the blob (can be None) theirs_blob: Their version of the blob (can be None) path: Optional path of the file being merged gitattributes: Optional GitAttributes object for checking merge drivers config: Optional Config object for loading merge driver configuration Returns: Tuple of (merged_content, had_conflicts) """ # Check for merge driver merge_driver_name = None if path and gitattributes: attrs = gitattributes.match_path(path) merge_value = attrs.get(b"merge") if merge_value and isinstance(merge_value, bytes) and merge_value != b"text": merge_driver_name = merge_value.decode("utf-8", errors="replace") # Use merge driver if found if merge_driver_name: registry = get_merge_driver_registry(config) driver = registry.get_driver(merge_driver_name) if driver: # Get content from blobs base_content = base_blob.data if base_blob else b"" ours_content = ours_blob.data if ours_blob else b"" theirs_content = theirs_blob.data if theirs_blob else b"" # Use merge driver merged_content, success = driver.merge( ancestor=base_content, ours=ours_content, theirs=theirs_content, path=path.decode("utf-8", errors="replace") if path else None, marker_size=7, ) # Convert success (no conflicts) to had_conflicts (conflicts occurred) had_conflicts = not success return merged_content, had_conflicts # Fall back to default merge behavior # Handle deletion cases if ours_blob is None and theirs_blob is None: return b"", False if base_blob is None: # No common ancestor if ours_blob is None: assert theirs_blob is not None return theirs_blob.data, False elif theirs_blob is None: return ours_blob.data, False elif ours_blob.data == theirs_blob.data: return ours_blob.data, False else: # Both added different content - conflict m = make_merge3( [], ours_blob.data.splitlines(True), theirs_blob.data.splitlines(True), ) return _merge3_to_bytes(m), True # Get content for each version base_content = base_blob.data if base_blob else b"" ours_content = ours_blob.data if ours_blob else b"" theirs_content = theirs_blob.data if theirs_blob else b"" # Check if either side deleted if ours_blob is None or theirs_blob is None: if ours_blob is None and theirs_blob is None: return b"", False elif ours_blob is None: # We deleted, check if they modified if base_content == theirs_content: return b"", False # They didn't modify, accept deletion else: # Conflict: we deleted, they modified m = make_merge3( base_content.splitlines(True), [], theirs_content.splitlines(True), ) return _merge3_to_bytes(m), True else: # They deleted, check if we modified if base_content == ours_content: return b"", False # We didn't modify, accept deletion else: # Conflict: they deleted, we modified m = make_merge3( base_content.splitlines(True), ours_content.splitlines(True), [], ) return _merge3_to_bytes(m), True # Both sides exist, check if merge is needed if ours_content == theirs_content: return ours_content, False elif base_content == ours_content: return theirs_content, False elif base_content == theirs_content: return ours_content, False # Perform three-way merge m = make_merge3( base_content.splitlines(True), ours_content.splitlines(True), theirs_content.splitlines(True), ) # Check for conflicts and generate merged content merged_content = _merge3_to_bytes(m) has_conflicts = b"<<<<<<< ours" in merged_content return merged_content, has_conflicts class Merger: """Handles git merge operations.""" def __init__( self, object_store: BaseObjectStore, gitattributes: GitAttributes | None = None, 
config: Config | None = None, ) -> None: """Initialize merger. Args: object_store: Object store to read objects from gitattributes: Optional GitAttributes object for checking merge drivers config: Optional Config object for loading merge driver configuration """ self.object_store = object_store self.gitattributes = gitattributes self.config = config def merge_blobs( self, base_blob: Blob | None, ours_blob: Blob | None, theirs_blob: Blob | None, path: bytes | None = None, ) -> tuple[bytes, bool]: """Perform three-way merge on blob contents. Args: base_blob: Common ancestor blob (can be None) ours_blob: Our version of the blob (can be None) theirs_blob: Their version of the blob (can be None) path: Optional path of the file being merged Returns: Tuple of (merged_content, had_conflicts) """ return merge_blobs( base_blob, ours_blob, theirs_blob, path, self.gitattributes, self.config ) def merge_trees( self, base_tree: Tree | None, ours_tree: Tree, theirs_tree: Tree ) -> tuple[Tree, list[bytes]]: """Perform three-way merge on trees. Args: base_tree: Common ancestor tree (can be None for no common ancestor) ours_tree: Our version of the tree theirs_tree: Their version of the tree Returns: tuple of (merged_tree, list_of_conflicted_paths) """ conflicts: list[bytes] = [] merged_entries: dict[bytes, tuple[int | None, ObjectID | None]] = {} # Get all paths from all trees all_paths = set() if base_tree: for entry in base_tree.items(): assert entry.path is not None all_paths.add(entry.path) for entry in ours_tree.items(): assert entry.path is not None all_paths.add(entry.path) for entry in theirs_tree.items(): assert entry.path is not None all_paths.add(entry.path) # Process each path for path in sorted(all_paths): base_entry = None if base_tree: try: base_entry = base_tree.lookup_path( self.object_store.__getitem__, path ) except KeyError: pass try: ours_entry = ours_tree.lookup_path(self.object_store.__getitem__, path) except KeyError: ours_entry = None try: theirs_entry = theirs_tree.lookup_path( self.object_store.__getitem__, path ) except KeyError: theirs_entry = None # Extract mode and sha _base_mode, base_sha = base_entry if base_entry else (None, None) ours_mode, ours_sha = ours_entry if ours_entry else (None, None) theirs_mode, theirs_sha = theirs_entry if theirs_entry else (None, None) # Handle deletions if ours_sha is None and theirs_sha is None: continue # Deleted in both # Handle additions if base_sha is None: if ours_sha == theirs_sha and ours_mode == theirs_mode: # Same addition in both merged_entries[path] = (ours_mode, ours_sha) elif ours_sha is None: # Added only in theirs merged_entries[path] = (theirs_mode, theirs_sha) elif theirs_sha is None: # Added only in ours merged_entries[path] = (ours_mode, ours_sha) else: # Different additions - conflict conflicts.append(path) # For now, keep ours merged_entries[path] = (ours_mode, ours_sha) continue # Check for mode conflicts if ( ours_mode != theirs_mode and ours_mode is not None and theirs_mode is not None ): conflicts.append(path) # For now, keep ours merged_entries[path] = (ours_mode, ours_sha) continue # Handle modifications if ours_sha == theirs_sha: # Same modification or no change if ours_sha is not None: merged_entries[path] = (ours_mode, ours_sha) elif base_sha == ours_sha and theirs_sha is not None: # Only theirs modified merged_entries[path] = (theirs_mode, theirs_sha) elif base_sha == theirs_sha and ours_sha is not None: # Only ours modified merged_entries[path] = (ours_mode, ours_sha) elif ours_sha is None: # We deleted if 
base_sha == theirs_sha: # They didn't modify, accept deletion pass else: # They modified, we deleted - conflict conflicts.append(path) elif theirs_sha is None: # They deleted if base_sha == ours_sha: # We didn't modify, accept deletion pass else: # We modified, they deleted - conflict conflicts.append(path) merged_entries[path] = (ours_mode, ours_sha) else: # Both modified differently # For trees and submodules, this is a conflict if S_ISGITLINK(ours_mode or 0) or S_ISGITLINK(theirs_mode or 0): conflicts.append(path) merged_entries[path] = (ours_mode, ours_sha) elif (ours_mode or 0) & 0o170000 == 0o040000 or ( theirs_mode or 0 ) & 0o170000 == 0o040000: # Tree conflict conflicts.append(path) merged_entries[path] = (ours_mode, ours_sha) else: # Try to merge blobs base_blob = None if base_sha: base_obj = self.object_store[base_sha] if is_blob(base_obj): base_blob = base_obj else: raise TypeError( f"Expected blob for {path!r}, got {base_obj.type_name.decode()}" ) ours_blob = None if ours_sha: ours_obj = self.object_store[ours_sha] if is_blob(ours_obj): ours_blob = ours_obj else: raise TypeError( f"Expected blob for {path!r}, got {ours_obj.type_name.decode()}" ) theirs_blob = None if theirs_sha: theirs_obj = self.object_store[theirs_sha] if is_blob(theirs_obj): theirs_blob = theirs_obj else: raise TypeError( f"Expected blob for {path!r}, got {theirs_obj.type_name.decode()}" ) assert isinstance(base_blob, Blob) assert isinstance(ours_blob, Blob) assert isinstance(theirs_blob, Blob) merged_content, had_conflict = self.merge_blobs( base_blob, ours_blob, theirs_blob, path ) if had_conflict: conflicts.append(path) # Store merged blob merged_blob = Blob.from_string(merged_content) self.object_store.add_object(merged_blob) merged_entries[path] = (ours_mode or theirs_mode, merged_blob.id) # Build merged tree merged_tree = Tree() for path, (mode, sha) in sorted(merged_entries.items()): if mode is not None and sha is not None: merged_tree.add(path, mode, sha) return merged_tree, conflicts def _create_virtual_commit( object_store: BaseObjectStore, tree: Tree, parents: list[ObjectID], message: bytes = b"Virtual merge base", ) -> Commit: """Create a virtual commit object for recursive merging. Args: object_store: Object store to add the commit to tree: Tree object for the commit parents: List of parent commit IDs message: Commit message Returns: The created Commit object """ # Add the tree to the object store object_store.add_object(tree) # Create a virtual commit commit = Commit() commit.tree = tree.id commit.parents = parents commit.author = b"Dulwich Recursive Merge " commit.committer = commit.author commit.commit_time = 0 commit.author_time = 0 commit.commit_timezone = 0 commit.author_timezone = 0 commit.encoding = b"UTF-8" commit.message = message # Add the commit to the object store object_store.add_object(commit) return commit def recursive_merge( object_store: BaseObjectStore, merge_bases: list[ObjectID], ours_commit: Commit, theirs_commit: Commit, gitattributes: GitAttributes | None = None, config: Config | None = None, ) -> tuple[Tree, list[bytes]]: """Perform a recursive merge with multiple merge bases. This implements Git's recursive merge strategy, which handles cases where there are multiple common ancestors (criss-cross merges). The algorithm: 1. If there's 0 or 1 merge base, perform a simple three-way merge 2. 
If there are multiple merge bases, merge them recursively to create a virtual merge base, then use that for the final three-way merge Args: object_store: Object store to read/write objects merge_bases: List of merge base commit IDs ours_commit: Our commit theirs_commit: Their commit gitattributes: Optional GitAttributes object for checking merge drivers config: Optional Config object for loading merge driver configuration Returns: tuple of (merged_tree, list_of_conflicted_paths) """ if not merge_bases: # No common ancestor - use None as base return three_way_merge( object_store, None, ours_commit, theirs_commit, gitattributes, config ) elif len(merge_bases) == 1: # Single merge base - simple three-way merge base_commit_obj = object_store[merge_bases[0]] if not isinstance(base_commit_obj, Commit): raise TypeError( f"Expected commit, got {base_commit_obj.type_name.decode()}" ) return three_way_merge( object_store, base_commit_obj, ours_commit, theirs_commit, gitattributes, config, ) else: # Multiple merge bases - need to create a virtual merge base # Start by merging the first two bases virtual_base_id = merge_bases[0] virtual_commit_obj = object_store[virtual_base_id] if not isinstance(virtual_commit_obj, Commit): raise TypeError( f"Expected commit, got {virtual_commit_obj.type_name.decode()}" ) # Recursively merge each additional base for next_base_id in merge_bases[1:]: next_base_obj = object_store[next_base_id] if not isinstance(next_base_obj, Commit): raise TypeError( f"Expected commit, got {next_base_obj.type_name.decode()}" ) # Find merge base of these two bases # Import here to avoid circular dependency # We need access to the repo for find_merge_base # For now, we'll perform a simple three-way merge without recursion # between the two virtual commits # A proper implementation would require passing the repo object # Perform three-way merge of the two bases (using None as their base) merged_tree, _conflicts = three_way_merge( object_store, None, # No common ancestor for virtual merge bases virtual_commit_obj, next_base_obj, gitattributes, config, ) # Create a virtual commit with this merged tree virtual_commit_obj = _create_virtual_commit( object_store, merged_tree, [virtual_base_id, next_base_id], ) virtual_base_id = virtual_commit_obj.id # Now use the virtual merge base for the final merge return three_way_merge( object_store, virtual_commit_obj, ours_commit, theirs_commit, gitattributes, config, ) def three_way_merge( object_store: BaseObjectStore, base_commit: Commit | None, ours_commit: Commit, theirs_commit: Commit, gitattributes: GitAttributes | None = None, config: Config | None = None, ) -> tuple[Tree, list[bytes]]: """Perform a three-way merge between commits. 
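    Example (an illustrative sketch; ``object_store`` and the commit ids are
    assumed to exist already):

        base = object_store[base_id]      # Commit objects looked up by sha
        ours = object_store[ours_id]
        theirs = object_store[theirs_id]
        tree, conflicts = three_way_merge(object_store, base, ours, theirs)
        if conflicts:
            print("conflicted paths:", conflicts)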
Args: object_store: Object store to read/write objects base_commit: Common ancestor commit (None if no common ancestor) ours_commit: Our commit theirs_commit: Their commit gitattributes: Optional GitAttributes object for checking merge drivers config: Optional Config object for loading merge driver configuration Returns: tuple of (merged_tree, list_of_conflicted_paths) """ merger = Merger(object_store, gitattributes, config) base_tree = None if base_commit: base_obj = object_store[base_commit.tree] if is_tree(base_obj): base_tree = base_obj else: raise TypeError(f"Expected tree, got {base_obj.type_name.decode()}") ours_obj = object_store[ours_commit.tree] if is_tree(ours_obj): ours_tree = ours_obj else: raise TypeError(f"Expected tree, got {ours_obj.type_name.decode()}") theirs_obj = object_store[theirs_commit.tree] if is_tree(theirs_obj): theirs_tree = theirs_obj else: raise TypeError(f"Expected tree, got {theirs_obj.type_name.decode()}") assert base_tree is None or isinstance(base_tree, Tree) assert isinstance(ours_tree, Tree) assert isinstance(theirs_tree, Tree) return merger.merge_trees(base_tree, ours_tree, theirs_tree) def octopus_merge( object_store: BaseObjectStore, merge_bases: list[ObjectID], head_commit: Commit, other_commits: list[Commit], gitattributes: GitAttributes | None = None, config: Config | None = None, ) -> tuple[Tree, list[bytes]]: """Perform an octopus merge of multiple commits. The octopus merge strategy merges multiple branches sequentially into a single commit with multiple parents. It refuses to proceed if any merge would result in conflicts that require manual resolution. Args: object_store: Object store to read/write objects merge_bases: List of common ancestor commit IDs for all commits head_commit: Current HEAD commit (ours) other_commits: List of commits to merge (theirs) gitattributes: Optional GitAttributes object for checking merge drivers config: Optional Config object for loading merge driver configuration Returns: tuple of (merged_tree, list_of_conflicted_paths) If any conflicts occur during the sequential merges, the function returns early with the conflicts list populated. 
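    Example (an illustrative sketch; the commit objects and ids are assumed):

        tree, conflicts = octopus_merge(
            object_store, [base_id], head_commit, [branch_a_tip, branch_b_tip]
        )
        # An empty ``conflicts`` list means every branch merged cleanly.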
Raises: TypeError: If any object is not of the expected type """ if not other_commits: raise ValueError("octopus_merge requires at least one commit to merge") # Start with the head commit's tree as our current state current_commit = head_commit # Merge each commit sequentially for i, other_commit in enumerate(other_commits): # Find the merge base between current state and the commit we're merging # For octopus merges, we use the octopus base for all commits if merge_bases: base_commit_id = merge_bases[0] base_commit = object_store[base_commit_id] if not isinstance(base_commit, Commit): raise TypeError(f"Expected Commit, got {type(base_commit)}") else: base_commit = None # Perform three-way merge merged_tree, conflicts = three_way_merge( object_store, base_commit, current_commit, other_commit, gitattributes, config, ) # Octopus merge refuses to proceed if there are conflicts if conflicts: return merged_tree, conflicts # Add merged tree to object store object_store.add_object(merged_tree) # Create a temporary commit object with the merged tree for the next iteration # This allows us to continue merging additional commits if i < len(other_commits) - 1: temp_commit = Commit() temp_commit.tree = merged_tree.id # For intermediate merges, we use the same parent as current temp_commit.parents = ( current_commit.parents if current_commit.parents else [current_commit.id] ) # Set minimal required commit fields temp_commit.author = current_commit.author temp_commit.committer = current_commit.committer temp_commit.author_time = current_commit.author_time temp_commit.commit_time = current_commit.commit_time temp_commit.author_timezone = current_commit.author_timezone temp_commit.commit_timezone = current_commit.commit_timezone temp_commit.message = b"Temporary octopus merge commit" object_store.add_object(temp_commit) current_commit = temp_commit return merged_tree, [] dulwich-1.0.0/dulwich/merge_drivers.py000066400000000000000000000174541513301442600200270ustar00rootroot00000000000000# merge_drivers.py -- Merge driver support for dulwich # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Merge driver support for dulwich.""" __all__ = [ "MergeDriver", "MergeDriverRegistry", "ProcessMergeDriver", "get_merge_driver_registry", ] import os import subprocess import tempfile from collections.abc import Callable from typing import Protocol from .config import Config class MergeDriver(Protocol): """Protocol for merge drivers.""" def merge( self, ancestor: bytes, ours: bytes, theirs: bytes, path: str | None = None, marker_size: int = 7, ) -> tuple[bytes, bool]: """Perform a three-way merge. 
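        A minimal sketch of a conforming driver (purely illustrative -- it
        always keeps "ours" and reports a clean merge). Such a driver can then
        be registered via MergeDriverRegistry.register_driver():

            class OursDriver:
                def merge(self, ancestor, ours, theirs,
                          path=None, marker_size=7):
                    return ours, True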
Args: ancestor: Content of the common ancestor version ours: Content of our version theirs: Content of their version path: Optional path of the file being merged marker_size: Size of conflict markers (default 7) Returns: Tuple of (merged content, success flag) If success is False, the content may contain conflict markers """ ... class ProcessMergeDriver: """Merge driver that runs an external process.""" def __init__(self, command: str, name: str = "custom"): """Initialize process merge driver. Args: command: Command to run for merging name: Name of the merge driver """ self.command = command self.name = name def merge( self, ancestor: bytes, ours: bytes, theirs: bytes, path: str | None = None, marker_size: int = 7, ) -> tuple[bytes, bool]: """Perform merge using external process. The command is executed with the following placeholders: - %O: path to ancestor version (base) - %A: path to our version - %B: path to their version - %L: conflict marker size - %P: original path of the file The command should write the merge result to the file at %A. Exit code 0 means successful merge, non-zero means conflicts. """ with tempfile.TemporaryDirectory() as tmpdir: # Write temporary files ancestor_path = os.path.join(tmpdir, "ancestor") ours_path = os.path.join(tmpdir, "ours") theirs_path = os.path.join(tmpdir, "theirs") with open(ancestor_path, "wb") as f: f.write(ancestor) with open(ours_path, "wb") as f: f.write(ours) with open(theirs_path, "wb") as f: f.write(theirs) # Prepare command with placeholders cmd = self.command cmd = cmd.replace("%O", ancestor_path) cmd = cmd.replace("%A", ours_path) cmd = cmd.replace("%B", theirs_path) cmd = cmd.replace("%L", str(marker_size)) if path: cmd = cmd.replace("%P", path) # Execute merge command try: result = subprocess.run( cmd, shell=True, capture_output=True, text=False, ) # Read merged content from ours file with open(ours_path, "rb") as f: merged_content = f.read() # Exit code 0 means clean merge, non-zero means conflicts success = result.returncode == 0 return merged_content, success except subprocess.SubprocessError: # If the command fails completely, return original with conflicts return ours, False class MergeDriverRegistry: """Registry for merge drivers.""" def __init__(self, config: Config | None = None): """Initialize merge driver registry. Args: config: Git configuration object """ self._drivers: dict[str, MergeDriver] = {} self._factories: dict[str, Callable[[], MergeDriver]] = {} self._config = config # Register built-in drivers self._register_builtin_drivers() def _register_builtin_drivers(self) -> None: """Register built-in merge drivers.""" # The "text" driver is the default three-way merge # We don't register it here as it's handled by the default merge code def register_driver(self, name: str, driver: MergeDriver) -> None: """Register a merge driver instance. Args: name: Name of the merge driver driver: Driver instance """ self._drivers[name] = driver def register_factory(self, name: str, factory: Callable[[], MergeDriver]) -> None: """Register a factory function for creating merge drivers. Args: name: Name of the merge driver factory: Factory function that returns a MergeDriver """ self._factories[name] = factory def get_driver(self, name: str) -> MergeDriver | None: """Get a merge driver by name. 
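        Example (illustrative sketch; "filfre" is a hypothetical driver name,
        assumed to be configured via a merge.filfre.driver entry in config):

            registry = get_merge_driver_registry(config)
            driver = registry.get_driver("filfre")
            if driver is not None:
                merged, ok = driver.merge(base_data, ours_data, theirs_data,
                                          path="file.txt")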
Args: name: Name of the merge driver Returns: MergeDriver instance or None if not found """ # First check registered drivers if name in self._drivers: return self._drivers[name] # Then check factories if name in self._factories: driver = self._factories[name]() self._drivers[name] = driver return driver # Finally check configuration if self._config: config_driver = self._create_from_config(name) if config_driver is not None: self._drivers[name] = config_driver return config_driver return None def _create_from_config(self, name: str) -> MergeDriver | None: """Create a merge driver from git configuration. Args: name: Name of the merge driver Returns: MergeDriver instance or None if not configured """ if not self._config: return None # Look for merge..driver configuration try: command = self._config.get(("merge", name), "driver") if command: return ProcessMergeDriver(command.decode(), name) except KeyError: pass return None # Global registry instance _merge_driver_registry: MergeDriverRegistry | None = None def get_merge_driver_registry(config: Config | None = None) -> MergeDriverRegistry: """Get the global merge driver registry. Args: config: Git configuration object Returns: MergeDriverRegistry instance """ global _merge_driver_registry if _merge_driver_registry is None: _merge_driver_registry = MergeDriverRegistry(config) elif config is not None: # Update config if provided _merge_driver_registry._config = config return _merge_driver_registry dulwich-1.0.0/dulwich/midx.py000066400000000000000000000524611513301442600161300ustar00rootroot00000000000000# midx.py -- Multi-Pack-Index (MIDX) support # Copyright (C) 2025 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Multi-Pack-Index (MIDX) support. A multi-pack-index (MIDX) provides a single index that covers multiple pack files, enabling fast object lookup across all packs without opening each pack index. The MIDX file format consists of: - A header with signature, version, and hash algorithm - A chunk lookup table - Multiple chunks containing pack names, OID fanout, OID lookup, and object offsets - A trailer with checksum This module provides: - Reading MIDX files - Writing MIDX files - Integration with pack-based object stores Limitations: - Incremental MIDX chains are not yet supported (base_midx_files must be 0) - BTMP (bitmapped packfiles) chunk is not yet implemented - RIDX (reverse index) chunk is not yet implemented Note: Incremental MIDX chains were introduced in Git 2.47 as an experimental feature, where multiple MIDX files can be chained together. The format includes a base_midx_files field in the header and uses a multi-pack-index.d/ directory with a multi-pack-index-chain file. 
This feature is not yet supported by Dulwich as the specification is still evolving. """ __all__ = [ "CHUNK_BTMP", "CHUNK_LOFF", "CHUNK_OIDF", "CHUNK_OIDL", "CHUNK_OOFF", "CHUNK_PNAM", "CHUNK_RIDX", "HASH_ALGORITHM_SHA1", "HASH_ALGORITHM_SHA256", "MIDX_SIGNATURE", "MIDX_VERSION", "MultiPackIndex", "load_midx", "load_midx_file", "write_midx", ] import os import struct from collections.abc import Iterator from io import UnsupportedOperation from typing import IO, Any try: import mmap except ImportError: has_mmap = False else: has_mmap = True from .file import GitFile, _GitFile from .objects import ObjectID, RawObjectID from .pack import SHA1Writer # MIDX signature MIDX_SIGNATURE = b"MIDX" # MIDX version MIDX_VERSION = 1 # Chunk identifiers (4 bytes each) CHUNK_PNAM = b"PNAM" # Packfile names CHUNK_OIDF = b"OIDF" # OID fanout table CHUNK_OIDL = b"OIDL" # OID lookup table CHUNK_OOFF = b"OOFF" # Object offsets CHUNK_LOFF = b"LOFF" # Large offsets (optional) CHUNK_BTMP = b"BTMP" # Bitmapped packfiles (optional) CHUNK_RIDX = b"RIDX" # Reverse index (optional) # Hash algorithm identifiers HASH_ALGORITHM_SHA1 = 1 HASH_ALGORITHM_SHA256 = 2 class MultiPackIndex: """Multi-pack-index for efficient object lookup across multiple pack files.""" def __init__( self, filename: str | os.PathLike[str], file: IO[bytes] | _GitFile | None = None, contents: bytes | None = None, size: int | None = None, ) -> None: """Initialize a MultiPackIndex. Args: filename: Path to the MIDX file file: Optional file object contents: Optional mmap'd contents size: Optional size of the MIDX file """ self._filename = os.fspath(filename) self._file = file self._size = size # Instance variables that will be set during parsing self.version: int self.hash_algorithm: int self.hash_size: int self.chunk_count: int self.base_midx_files: int self.pack_count: int self.pack_names: list[str] self.object_count: int self._chunks: dict[bytes, int] self._fanout_table: list[int] self._oidl_offset: int self._ooff_offset: int self._loff_offset: int # Load file contents if contents is None: if file is None: with GitFile(filename, "rb") as f: self._contents, self._size = self._load_file_contents(f, size) else: self._contents, self._size = self._load_file_contents(file, size) else: self._contents = contents # Parse header self._parse_header() # Parse chunk lookup table self._parse_chunk_table() def _load_file_contents( self, f: IO[bytes] | _GitFile, size: int | None = None ) -> tuple[bytes | Any, int]: """Load contents from a file, preferring mmap when possible. 
Args: f: File-like object to load size: Expected size, or None to determine from file Returns: Tuple of (contents, size) """ try: fd = f.fileno() except (UnsupportedOperation, AttributeError): fd = None # Attempt to use mmap if possible if fd is not None: if size is None: size = os.fstat(fd).st_size if has_mmap: try: contents = mmap.mmap(fd, size, access=mmap.ACCESS_READ) except (OSError, ValueError): # Can't mmap - perhaps a socket or invalid file descriptor pass else: return contents, size # Fall back to reading entire file into memory contents_bytes = f.read() size = len(contents_bytes) return contents_bytes, size def _parse_header(self) -> None: """Parse the MIDX header.""" if len(self._contents) < 12: raise ValueError("MIDX file too small") # Check signature signature = self._contents[0:4] if signature != MIDX_SIGNATURE: raise ValueError(f"Invalid MIDX signature: {signature!r}") # Read version self.version = self._contents[4] if self.version != MIDX_VERSION: raise ValueError(f"Unsupported MIDX version: {self.version}") # Read object ID version (hash algorithm) self.hash_algorithm = self._contents[5] if self.hash_algorithm == HASH_ALGORITHM_SHA1: self.hash_size = 20 elif self.hash_algorithm == HASH_ALGORITHM_SHA256: self.hash_size = 32 else: raise ValueError(f"Unknown hash algorithm: {self.hash_algorithm}") # Read chunk count self.chunk_count = self._contents[6] # Read base MIDX files count (currently always 0) self.base_midx_files = self._contents[7] if self.base_midx_files != 0: raise ValueError("Incremental MIDX not yet supported") # Read pack file count (self.pack_count,) = struct.unpack(">L", self._contents[8:12]) def _parse_chunk_table(self) -> None: """Parse the chunk lookup table.""" self._chunks = {} # Chunk table starts at offset 12 offset = 12 # Each chunk entry is 12 bytes (4-byte ID + 8-byte offset) for i in range(self.chunk_count + 1): # +1 for terminator chunk_id = self._contents[offset : offset + 4] (chunk_offset,) = struct.unpack( ">Q", self._contents[offset + 4 : offset + 12] ) if chunk_id == b"\x00\x00\x00\x00": # Terminator entry break self._chunks[chunk_id] = chunk_offset offset += 12 # Parse required chunks self._parse_pnam_chunk() self._parse_oidf_chunk() self._parse_oidl_chunk() self._parse_ooff_chunk() # Parse optional chunks if CHUNK_LOFF in self._chunks: self._parse_loff_chunk() def _parse_pnam_chunk(self) -> None: """Parse the Packfile Names (PNAM) chunk.""" if CHUNK_PNAM not in self._chunks: raise ValueError("Required PNAM chunk not found") offset = self._chunks[CHUNK_PNAM] self.pack_names = [] # Find the end of the PNAM chunk (next chunk or end of chunks section) next_offset = min( (o for o in self._chunks.values() if o > offset), default=len(self._contents), ) # Parse null-terminated pack names current = offset while current < next_offset: # Find the next null terminator null_pos = self._contents.find(b"\x00", current, next_offset) if null_pos == -1: break pack_name = self._contents[current:null_pos].decode("utf-8") if pack_name: # Skip empty strings (padding) self.pack_names.append(pack_name) current = null_pos + 1 def _parse_oidf_chunk(self) -> None: """Parse the OID Fanout (OIDF) chunk.""" if CHUNK_OIDF not in self._chunks: raise ValueError("Required OIDF chunk not found") offset = self._chunks[CHUNK_OIDF] self._fanout_table = [] # Read 256 4-byte entries for i in range(256): (count,) = struct.unpack( ">L", self._contents[offset + i * 4 : offset + i * 4 + 4] ) self._fanout_table.append(count) # Total object count is the last entry self.object_count = 
self._fanout_table[255] def _parse_oidl_chunk(self) -> None: """Parse the OID Lookup (OIDL) chunk.""" if CHUNK_OIDL not in self._chunks: raise ValueError("Required OIDL chunk not found") self._oidl_offset = self._chunks[CHUNK_OIDL] def _parse_ooff_chunk(self) -> None: """Parse the Object Offsets (OOFF) chunk.""" if CHUNK_OOFF not in self._chunks: raise ValueError("Required OOFF chunk not found") self._ooff_offset = self._chunks[CHUNK_OOFF] def _parse_loff_chunk(self) -> None: """Parse the Large Offsets (LOFF) chunk.""" self._loff_offset = self._chunks[CHUNK_LOFF] def __len__(self) -> int: """Return the number of objects in this MIDX.""" return self.object_count def _get_oid(self, index: int) -> RawObjectID: """Get the object ID at the given index. Args: index: Index of the object Returns: Binary object ID """ if index < 0 or index >= self.object_count: raise IndexError(f"Index {index} out of range") offset = self._oidl_offset + index * self.hash_size return RawObjectID(self._contents[offset : offset + self.hash_size]) def _get_pack_info(self, index: int) -> tuple[int, int]: """Get pack ID and offset for object at the given index. Args: index: Index of the object Returns: Tuple of (pack_id, offset) """ if index < 0 or index >= self.object_count: raise IndexError(f"Index {index} out of range") # Each entry is 8 bytes (4-byte pack ID + 4-byte offset) offset = self._ooff_offset + index * 8 (pack_id,) = struct.unpack(">L", self._contents[offset : offset + 4]) (pack_offset,) = struct.unpack(">L", self._contents[offset + 4 : offset + 8]) # Check if this is a large offset (MSB set) if pack_offset & 0x80000000: # Look up in LOFF chunk if CHUNK_LOFF not in self._chunks: raise ValueError("Large offset found but no LOFF chunk") large_index = pack_offset & 0x7FFFFFFF large_offset_pos = self._loff_offset + large_index * 8 (pack_offset,) = struct.unpack( ">Q", self._contents[large_offset_pos : large_offset_pos + 8] ) return pack_id, pack_offset def object_offset(self, sha: ObjectID | RawObjectID) -> tuple[str, int] | None: """Return the pack name and offset for the given object. Args: sha: Binary SHA-1 or SHA-256 hash Returns: Tuple of (pack_name, offset) or None if not found """ if len(sha) != self.hash_size: raise ValueError( f"SHA size mismatch: expected {self.hash_size}, got {len(sha)}" ) # Use fanout table to narrow search range first_byte = sha[0] start_idx = 0 if first_byte == 0 else self._fanout_table[first_byte - 1] end_idx = self._fanout_table[first_byte] # Binary search within the range while start_idx < end_idx: mid = (start_idx + end_idx) // 2 mid_sha = self._get_oid(mid) if mid_sha == sha: # Found it! pack_id, offset = self._get_pack_info(mid) return self.pack_names[pack_id], offset elif mid_sha < sha: start_idx = mid + 1 else: end_idx = mid return None def __contains__(self, sha: ObjectID | RawObjectID) -> bool: """Check if the given object SHA is in this MIDX. Args: sha: Binary SHA hash Returns: True if the object is in this MIDX """ return self.object_offset(sha) is not None def iterentries(self) -> Iterator[tuple[RawObjectID, str, int]]: """Iterate over all entries in this MIDX. 
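        Example (illustrative; the multi-pack-index path is hypothetical):

            midx = load_midx("objects/pack/multi-pack-index")
            try:
                for sha, pack_name, offset in midx.iterentries():
                    print(pack_name, offset)
            finally:
                midx.close()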
Yields: Tuples of (sha, pack_name, offset) """ for i in range(self.object_count): sha = self._get_oid(i) pack_id, offset = self._get_pack_info(i) pack_name = self.pack_names[pack_id] yield sha, pack_name, offset def close(self) -> None: """Close the MIDX file and release mmap resources.""" # Close mmap'd contents first if it's an mmap object if self._contents is not None and has_mmap: if isinstance(self._contents, mmap.mmap): self._contents.close() self._contents = None # Close file handle if self._file is not None: self._file.close() self._file = None def load_midx(path: str | os.PathLike[str]) -> MultiPackIndex: """Load a multi-pack-index file by path. Args: path: Path to the MIDX file Returns: A MultiPackIndex loaded from the given path """ with GitFile(path, "rb") as f: return load_midx_file(path, f) def load_midx_file( path: str | os.PathLike[str], f: IO[bytes] | _GitFile ) -> MultiPackIndex: """Load a multi-pack-index from a file-like object. Args: path: Path for the MIDX file f: File-like object Returns: A MultiPackIndex loaded from the given file """ return MultiPackIndex(path, file=f) def write_midx( f: IO[bytes], pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]], hash_algorithm: int = HASH_ALGORITHM_SHA1, ) -> bytes: """Write a multi-pack-index file. Args: f: File-like object to write to pack_index_entries: List of (pack_name, entries) tuples where entries are (sha, offset, crc32) tuples, sorted by SHA hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256) Returns: SHA-1 checksum of the written MIDX file """ if hash_algorithm == HASH_ALGORITHM_SHA1: hash_size = 20 elif hash_algorithm == HASH_ALGORITHM_SHA256: hash_size = 32 else: raise ValueError(f"Unknown hash algorithm: {hash_algorithm}") # Wrap file in SHA1Writer to compute checksum writer = SHA1Writer(f) # Sort pack entries by pack name (required by Git) pack_index_entries_sorted = sorted(pack_index_entries, key=lambda x: x[0]) # Collect all objects from all packs all_objects: list[tuple[RawObjectID, int, int]] = [] # (sha, pack_id, offset) pack_names: list[str] = [] for pack_id, (pack_name, entries) in enumerate(pack_index_entries_sorted): pack_names.append(pack_name) for sha, offset, _crc32 in entries: all_objects.append((sha, pack_id, offset)) # Sort all objects by SHA all_objects.sort(key=lambda x: x[0]) # Calculate offsets for chunks num_packs = len(pack_names) num_objects = len(all_objects) # Header: 12 bytes header_size = 12 # Chunk count: PNAM, OIDF, OIDL, OOFF, and optionally LOFF # We'll determine if LOFF is needed later chunk_count = 4 # PNAM, OIDF, OIDL, OOFF # Check if we need LOFF chunk (for offsets >= 2^31) need_loff = any(offset >= 2**31 for _sha, _pack_id, offset in all_objects) if need_loff: chunk_count += 1 # Chunk table: (chunk_count + 1) * 12 bytes (including terminator) chunk_table_size = (chunk_count + 1) * 12 # Calculate chunk offsets current_offset = header_size + chunk_table_size # PNAM chunk: pack names as null-terminated strings, padded to 4-byte boundary pnam_data = b"".join(name.encode("utf-8") + b"\x00" for name in pack_names) # Pad to 4-byte boundary pnam_padding = (4 - len(pnam_data) % 4) % 4 pnam_data += b"\x00" * pnam_padding pnam_offset = current_offset current_offset += len(pnam_data) # OIDF chunk: 256 * 4 bytes oidf_offset = current_offset oidf_size = 256 * 4 current_offset += oidf_size # OIDL chunk: num_objects * hash_size bytes oidl_offset = current_offset oidl_size = num_objects * hash_size current_offset += oidl_size # OOFF chunk: num_objects * 8 
bytes (4 for pack_id + 4 for offset) ooff_offset = current_offset ooff_size = num_objects * 8 current_offset += ooff_size # LOFF chunk (if needed): variable size # We'll calculate the exact size when we know how many large offsets we have loff_offset = current_offset if need_loff else 0 large_offsets: list[int] = [] # Calculate trailer offset (where checksum starts) # We need to pre-calculate large offset count for accurate trailer offset if need_loff: # Count large offsets large_offset_count = sum(1 for _, _, offset in all_objects if offset >= 2**31) loff_size = large_offset_count * 8 trailer_offset = current_offset + loff_size else: trailer_offset = current_offset # Write header writer.write(MIDX_SIGNATURE) # 4 bytes: signature writer.write(bytes([MIDX_VERSION])) # 1 byte: version writer.write(bytes([hash_algorithm])) # 1 byte: hash algorithm writer.write(bytes([chunk_count])) # 1 byte: chunk count writer.write(bytes([0])) # 1 byte: base MIDX files (always 0) writer.write(struct.pack(">L", num_packs)) # 4 bytes: pack count # Write chunk table chunk_table = [ (CHUNK_PNAM, pnam_offset), (CHUNK_OIDF, oidf_offset), (CHUNK_OIDL, oidl_offset), (CHUNK_OOFF, ooff_offset), ] if need_loff: chunk_table.append((CHUNK_LOFF, loff_offset)) for chunk_id, chunk_offset in chunk_table: writer.write(chunk_id) # 4 bytes writer.write(struct.pack(">Q", chunk_offset)) # 8 bytes # Write terminator (points to where trailer/checksum starts) writer.write(b"\x00\x00\x00\x00") # 4 bytes writer.write(struct.pack(">Q", trailer_offset)) # 8 bytes # Write PNAM chunk writer.write(pnam_data) # Write OIDF chunk (fanout table) fanout: list[int] = [0] * 256 for sha, _pack_id, _offset in all_objects: first_byte = sha[0] fanout[first_byte] += 1 # Convert counts to cumulative cumulative = 0 for i in range(256): cumulative += fanout[i] writer.write(struct.pack(">L", cumulative)) # Write OIDL chunk (object IDs) for sha, _pack_id, _offset in all_objects: writer.write(sha) # Write OOFF chunk (pack ID and offset for each object) for _sha, pack_id, offset in all_objects: writer.write(struct.pack(">L", pack_id)) if offset >= 2**31: # Use large offset table large_offset_index = len(large_offsets) large_offsets.append(offset) # Set MSB to indicate large offset writer.write(struct.pack(">L", 0x80000000 | large_offset_index)) else: writer.write(struct.pack(">L", offset)) # Write LOFF chunk if needed if need_loff: for large_offset in large_offsets: writer.write(struct.pack(">Q", large_offset)) # Write checksum return writer.write_sha() def write_midx_file( path: str | os.PathLike[str], pack_index_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]], hash_algorithm: int = HASH_ALGORITHM_SHA1, ) -> bytes: """Write a multi-pack-index file to disk. 
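    Example (illustrative sketch; the pack name is hypothetical, and the
    entries are assumed to come from an existing pack index whose
    iterentries() yields (sha, offset, crc32) tuples in SHA order):

        entries = [("pack-0001.pack", list(pack_index.iterentries()))]
        write_midx_file("objects/pack/multi-pack-index", entries)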
Args: path: Path where to write the MIDX file pack_index_entries: List of (pack_name, entries) tuples where entries are (sha, offset, crc32) tuples, sorted by SHA hash_algorithm: Hash algorithm to use (1=SHA-1, 2=SHA-256) Returns: SHA-1 checksum of the written MIDX file """ with GitFile(path, "wb") as f: return write_midx(f, pack_index_entries, hash_algorithm) # TODO: Add support for incremental MIDX chains # TODO: Add support for BTMP and RIDX chunks for bitmap integration dulwich-1.0.0/dulwich/notes.py000066400000000000000000000752221513301442600163170ustar00rootroot00000000000000# notes.py -- Git notes handling # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git notes handling.""" __all__ = [ "DEFAULT_NOTES_REF", "NOTES_REF_PREFIX", "Notes", "NotesTree", "create_notes_tree", "get_note_fanout_level", "get_note_path", "split_path_for_fanout", ] import stat from collections.abc import Iterator, Sequence from typing import TYPE_CHECKING from .objects import Blob, ObjectID, Tree from .refs import Ref if TYPE_CHECKING: from .config import StackedConfig from .object_store import BaseObjectStore from .refs import RefsContainer NOTES_REF_PREFIX = b"refs/notes/" DEFAULT_NOTES_REF = NOTES_REF_PREFIX + b"commits" def get_note_fanout_level(tree: Tree, object_store: "BaseObjectStore") -> int: """Determine the fanout level for a note tree. Git uses a fanout directory structure for performance with large numbers of notes. The fanout level determines how many levels of subdirectories are used. Args: tree: The notes tree to analyze object_store: Object store to retrieve subtrees Returns: Fanout level (0 for no fanout, 1 or 2 for fanout) """ # Count the total number of notes in the tree recursively def count_notes(tree: Tree, level: int = 0) -> int: """Count notes in a tree recursively. Args: tree: Tree to count notes in level: Current recursion level Returns: Total number of notes """ count = 0 for name, mode, sha in tree.items(): assert mode is not None if stat.S_ISREG(mode): count += 1 elif stat.S_ISDIR(mode) and level < 2: # Only recurse 2 levels deep assert sha is not None try: subtree = object_store[sha] assert isinstance(subtree, Tree) count += count_notes(subtree, level + 1) except KeyError: pass return count note_count = count_notes(tree) # Use fanout based on number of notes # Git typically starts using fanout around 256 notes if note_count < 256: return 0 elif note_count < 65536: # 256^2 return 1 else: return 2 def split_path_for_fanout(hexsha: bytes, fanout_level: int) -> tuple[bytes, ...]: """Split a hex SHA into path components based on fanout level. 
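    Examples (shortened SHAs for readability):
        >>> split_path_for_fanout(b"abcdef", 0)
        (b'abcdef',)
        >>> split_path_for_fanout(b"abcdef", 1)
        (b'ab', b'cdef')
        >>> split_path_for_fanout(b"abcdef", 2)
        (b'ab', b'cd', b'ef')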
Args: hexsha: Hex SHA of the object fanout_level: Number of directory levels for fanout Returns: Tuple of path components """ if fanout_level == 0: return (hexsha,) components = [] for i in range(fanout_level): components.append(hexsha[i * 2 : (i + 1) * 2]) components.append(hexsha[fanout_level * 2 :]) return tuple(components) def get_note_path(object_sha: bytes, fanout_level: int = 0) -> bytes: """Get the path within the notes tree for a given object. Args: object_sha: Hex SHA of the object to get notes for fanout_level: Fanout level to use Returns: Path within the notes tree """ components = split_path_for_fanout(object_sha, fanout_level) return b"/".join(components) class NotesTree: """Represents a Git notes tree.""" def __init__(self, tree: Tree, object_store: "BaseObjectStore") -> None: """Initialize a notes tree. Args: tree: The tree object containing notes object_store: Object store to retrieve note contents from """ self._tree = tree self._object_store = object_store self._fanout_level = self._detect_fanout_level() def _detect_fanout_level(self) -> int: """Detect the fanout level used in this notes tree. Returns: Detected fanout level """ if not self._tree.items(): return 0 # Check for presence of both files and directories has_files = False has_dirs = False dir_names = [] for name, mode, sha in self._tree.items(): assert name is not None assert mode is not None if stat.S_ISDIR(mode): has_dirs = True dir_names.append(name) elif stat.S_ISREG(mode): has_files = True # If we have files at the root level, check if they're full SHA names if has_files and not has_dirs: # Check if any file names are full hex strings (40 for SHA-1, 64 for SHA-256) hex_length = self._object_store.object_format.hex_length for name, mode, sha in self._tree.items(): assert name is not None assert mode is not None if stat.S_ISREG(mode) and len(name) == hex_length: try: int(name, 16) # Verify it's a valid hex string return 0 # No fanout except ValueError: pass # Check if all directories are 2-character hex names if has_dirs and dir_names: all_two_char_hex = all( len(name) == 2 and all(c in b"0123456789abcdef" for c in name) for name in dir_names ) if all_two_char_hex: # Check a sample directory to determine if it's level 1 or 2 sample_dir_name = dir_names[0] try: _sample_mode, sample_sha = self._tree[sample_dir_name] sample_tree = self._object_store[sample_sha] assert isinstance(sample_tree, Tree) # Check if this subtree also has 2-char hex directories sub_has_dirs = False for sub_name, sub_mode, sub_sha in sample_tree.items(): assert sub_name is not None assert sub_mode is not None if stat.S_ISDIR(sub_mode) and len(sub_name) == 2: try: int(sub_name, 16) sub_has_dirs = True break except ValueError: pass return 2 if sub_has_dirs else 1 except KeyError: return 1 # Assume level 1 if we can't inspect return 0 def _reorganize_tree(self, new_fanout_level: int) -> None: """Reorganize the notes tree to use a different fanout level. 
Args: new_fanout_level: The desired fanout level """ if new_fanout_level == self._fanout_level: return # Collect all existing notes notes = [] for object_sha, note_sha in self.list_notes(): note_obj = self._object_store[note_sha] if isinstance(note_obj, Blob): notes.append((object_sha, note_obj.data)) # Create new empty tree new_tree = Tree() self._object_store.add_object(new_tree) self._tree = new_tree self._fanout_level = new_fanout_level # Re-add all notes with new fanout structure using set_note # Temporarily set fanout back to avoid recursion for object_sha, note_content in notes: # Use the internal tree update logic without checking fanout again note_blob = Blob.from_string(note_content) self._object_store.add_object(note_blob) path = get_note_path(object_sha, new_fanout_level) components = path.split(b"/") # Build new tree structure def update_tree( tree: Tree, components: Sequence[bytes], blob_sha: ObjectID ) -> Tree: """Update tree with new note entry. Args: tree: Tree to update components: Path components blob_sha: SHA of the note blob Returns: Updated tree """ if len(components) == 1: # Leaf level - add the note blob new_tree = Tree() for name, mode, sha in tree.items(): if name != components[0]: assert name is not None assert mode is not None assert sha is not None new_tree.add(name, mode, sha) new_tree.add(components[0], stat.S_IFREG | 0o644, blob_sha) return new_tree else: # Directory level new_tree = Tree() found = False for name, mode, sha in tree.items(): if name == components[0]: # Update this subtree assert mode is not None and sha is not None if stat.S_ISDIR(mode): subtree = self._object_store[sha] assert isinstance(subtree, Tree) else: # If not a directory, we need to replace it subtree = Tree() new_subtree = update_tree(subtree, components[1:], blob_sha) self._object_store.add_object(new_subtree) new_tree.add(name, stat.S_IFDIR, new_subtree.id) found = True else: assert ( name is not None and mode is not None and sha is not None ) new_tree.add(name, mode, sha) if not found: # Create new subtree path subtree = Tree() new_subtree = update_tree(subtree, components[1:], blob_sha) self._object_store.add_object(new_subtree) new_tree.add(components[0], stat.S_IFDIR, new_subtree.id) return new_tree self._tree = update_tree(self._tree, components, note_blob.id) self._object_store.add_object(self._tree) def _update_tree_entry( self, tree: Tree, name: bytes, mode: int, sha: ObjectID ) -> Tree: """Update a tree entry and return the updated tree. Args: tree: The tree to update name: Name of the entry mode: File mode sha: SHA of the object Returns: The updated tree """ new_tree = Tree() for existing_name, existing_mode, existing_sha in tree.items(): if existing_name != name: assert ( existing_name is not None and existing_mode is not None and existing_sha is not None ) new_tree.add(existing_name, existing_mode, existing_sha) new_tree.add(name, mode, sha) self._object_store.add_object(new_tree) # Update the tree reference if tree is self._tree: self._tree = new_tree return new_tree def _get_note_sha(self, object_sha: bytes) -> ObjectID | None: """Get the SHA of the note blob for an object. 
Args: object_sha: SHA of the object to get notes for Returns: SHA of the note blob, or None if no note exists """ path = get_note_path(object_sha, self._fanout_level) components = path.split(b"/") current_tree = self._tree for component in components[:-1]: try: mode, sha = current_tree[component] if not stat.S_ISDIR(mode): # Not a directory return None obj = self._object_store[sha] assert isinstance(obj, Tree) current_tree = obj except KeyError: return None try: mode, sha = current_tree[components[-1]] if not stat.S_ISREG(mode): # Not a regular file return None return sha except KeyError: return None def get_note(self, object_sha: bytes) -> bytes | None: """Get the note content for an object. Args: object_sha: SHA of the object to get notes for Returns: Note content as bytes, or None if no note exists """ note_sha = self._get_note_sha(object_sha) if note_sha is None: return None try: note_obj = self._object_store[note_sha] if not isinstance(note_obj, Blob): return None data: bytes = note_obj.data return data except KeyError: return None def set_note(self, object_sha: bytes, note_content: bytes) -> Tree: """Set or update a note for an object. Args: object_sha: SHA of the object to annotate note_content: Content of the note Returns: New tree object with the note added/updated """ # Create note blob note_blob = Blob.from_string(note_content) self._object_store.add_object(note_blob) # Check if we need to reorganize the tree for better fanout desired_fanout = get_note_fanout_level(self._tree, self._object_store) if desired_fanout != self._fanout_level: self._reorganize_tree(desired_fanout) # Get path components path = get_note_path(object_sha, self._fanout_level) components = path.split(b"/") # Build new tree structure def update_tree( tree: Tree, components: Sequence[bytes], blob_sha: ObjectID ) -> Tree: """Update tree with new note entry. Args: tree: Tree to update components: Path components blob_sha: SHA of the note blob Returns: Updated tree """ if len(components) == 1: # Leaf level - add the note blob new_tree = Tree() for name, mode, sha in tree.items(): if name != components[0]: assert name is not None and mode is not None and sha is not None new_tree.add(name, mode, sha) new_tree.add(components[0], stat.S_IFREG | 0o644, blob_sha) return new_tree else: # Directory level new_tree = Tree() found = False for name, mode, sha in tree.items(): if name == components[0]: # Update this subtree assert mode is not None and sha is not None if stat.S_ISDIR(mode): subtree = self._object_store[sha] assert isinstance(subtree, Tree) else: # If not a directory, we need to replace it subtree = Tree() new_subtree = update_tree(subtree, components[1:], blob_sha) self._object_store.add_object(new_subtree) new_tree.add(name, stat.S_IFDIR, new_subtree.id) found = True else: assert name is not None and mode is not None and sha is not None new_tree.add(name, mode, sha) if not found: # Create new subtree path subtree = Tree() new_subtree = update_tree(subtree, components[1:], blob_sha) self._object_store.add_object(new_subtree) new_tree.add(components[0], stat.S_IFDIR, new_subtree.id) return new_tree new_tree = update_tree(self._tree, components, note_blob.id) self._object_store.add_object(new_tree) self._tree = new_tree self._fanout_level = self._detect_fanout_level() return new_tree def remove_note(self, object_sha: bytes) -> Tree | None: """Remove a note for an object. 
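        Example (illustrative sketch; tree and object_store are assumed to
        come from an existing notes ref, and annotated_sha is the hex SHA of
        the annotated object):

            notes_tree = NotesTree(tree, object_store)
            new_tree = notes_tree.remove_note(annotated_sha)
            # new_tree is None when no note existed for annotated_sha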
Args: object_sha: SHA of the object to remove notes from Returns: New tree object with the note removed, or None if no note existed """ if self._get_note_sha(object_sha) is None: return None # Get path components path = get_note_path(object_sha, self._fanout_level) components = path.split(b"/") # Build new tree structure without the note def remove_from_tree(tree: Tree, components: Sequence[bytes]) -> Tree | None: """Remove note entry from tree. Args: tree: Tree to remove from components: Path components Returns: Updated tree or None if empty """ if len(components) == 1: # Leaf level - remove the note new_tree = Tree() found = False for name, mode, sha in tree.items(): if name != components[0]: assert name is not None and mode is not None and sha is not None new_tree.add(name, mode, sha) else: found = True if not found: return None # Return None if tree is now empty return new_tree if len(new_tree) > 0 else None else: # Directory level new_tree = Tree() modified = False for name, mode, sha in tree.items(): assert name is not None and mode is not None and sha is not None if name == components[0] and stat.S_ISDIR(mode): # Update this subtree subtree = self._object_store[sha] assert isinstance(subtree, Tree) new_subtree = remove_from_tree(subtree, components[1:]) if new_subtree is not None: self._object_store.add_object(new_subtree) new_tree.add(name, stat.S_IFDIR, new_subtree.id) modified = True else: new_tree.add(name, mode, sha) if not modified: return None # Return None if tree is now empty return new_tree if len(new_tree) > 0 else None new_tree = remove_from_tree(self._tree, components) if new_tree is None: new_tree = Tree() # Empty tree self._object_store.add_object(new_tree) self._tree = new_tree self._fanout_level = self._detect_fanout_level() return new_tree def list_notes(self) -> Iterator[tuple[ObjectID, ObjectID]]: """List all notes in this tree. Yields: Tuples of (object_sha, note_sha) """ def walk_tree( tree: Tree, prefix: bytes = b"" ) -> Iterator[tuple[ObjectID, ObjectID]]: """Walk the notes tree recursively. Args: tree: Tree to walk prefix: Path prefix for current level Yields: Tuples of (object_sha, note_sha) """ for name, mode, sha in tree.items(): assert name is not None and mode is not None and sha is not None if stat.S_ISDIR(mode): # Directory subtree = self._object_store[sha] assert isinstance(subtree, Tree) yield from walk_tree(subtree, prefix + name) elif stat.S_ISREG(mode): # File # Reconstruct the full hex SHA from the path full_hex = prefix + name yield (ObjectID(full_hex), sha) yield from walk_tree(self._tree) def create_notes_tree(object_store: "BaseObjectStore") -> Tree: """Create an empty notes tree. Args: object_store: Object store to add the tree to Returns: Empty tree object """ tree = Tree() object_store.add_object(tree) return tree class Notes: """High-level interface for Git notes operations.""" def __init__( self, object_store: "BaseObjectStore", refs_container: "RefsContainer" ) -> None: """Initialize Notes. Args: object_store: Object store to read/write objects refs_container: Refs container to read/write refs """ self._object_store = object_store self._refs = refs_container def get_notes_ref( self, notes_ref: bytes | None = None, config: "StackedConfig | None" = None, ) -> Ref: """Get the notes reference to use. 
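        Example (illustrative; object_store and refs are assumed to come from
        an existing repository):

            notes = Notes(object_store, refs)
            notes.get_notes_ref()                      # defaults to b"refs/notes/commits"
            notes.get_notes_ref(b"refs/notes/review")  # an explicit ref is used as-is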
Args: notes_ref: The notes ref to use, or None to use the default config: Config to read notes.displayRef from Returns: The notes reference name """ if notes_ref is None: if config is not None: notes_ref = config.get((b"notes",), b"displayRef") if notes_ref is None: notes_ref = DEFAULT_NOTES_REF return Ref(notes_ref) def get_note( self, object_sha: bytes, notes_ref: bytes | None = None, config: "StackedConfig | None" = None, ) -> bytes | None: """Get the note for an object. Args: object_sha: SHA of the object to get notes for notes_ref: The notes ref to use, or None to use the default config: Config to read notes.displayRef from Returns: The note content as bytes, or None if no note exists """ notes_ref = self.get_notes_ref(notes_ref, config) try: notes_commit_sha = self._refs[notes_ref] except KeyError: return None # Get the commit object notes_obj = self._object_store[notes_commit_sha] # If it's a commit, get the tree from it from .objects import Commit if isinstance(notes_obj, Commit): notes_tree = self._object_store[notes_obj.tree] else: # If it's directly a tree (shouldn't happen in normal usage) notes_tree = notes_obj if not isinstance(notes_tree, Tree): return None notes_tree_obj = NotesTree(notes_tree, self._object_store) return notes_tree_obj.get_note(object_sha) def set_note( self, object_sha: bytes, note_content: bytes, notes_ref: bytes | None = None, author: bytes | None = None, committer: bytes | None = None, message: bytes | None = None, config: "StackedConfig | None" = None, ) -> bytes: """Set or update a note for an object. Args: object_sha: SHA of the object to annotate note_content: Content of the note notes_ref: The notes ref to use, or None to use the default author: Author identity (defaults to committer) committer: Committer identity (defaults to config) message: Commit message for the notes update config: Config to read user identity and notes.displayRef from Returns: SHA of the new notes commit """ import time from .objects import Commit from .repo import get_user_identity notes_ref = self.get_notes_ref(notes_ref, config) # Get current notes tree try: notes_commit_sha = self._refs[notes_ref] notes_obj = self._object_store[notes_commit_sha] # If it's a commit, get the tree from it if isinstance(notes_obj, Commit): notes_tree = self._object_store[notes_obj.tree] else: # If it's directly a tree (shouldn't happen in normal usage) notes_tree = notes_obj if not isinstance(notes_tree, Tree): notes_tree = create_notes_tree(self._object_store) except KeyError: notes_tree = create_notes_tree(self._object_store) # Update notes tree notes_tree_obj = NotesTree(notes_tree, self._object_store) new_tree = notes_tree_obj.set_note(object_sha, note_content) # Create commit if committer is None and config is not None: committer = get_user_identity(config, kind="COMMITTER") if committer is None: committer = b"Git User " if author is None: author = committer if message is None: message = b"Notes added by 'git notes add'" commit = Commit() commit.tree = new_tree.id commit.author = author commit.committer = committer commit.commit_time = commit.author_time = int(time.time()) commit.commit_timezone = commit.author_timezone = 0 commit.encoding = b"UTF-8" commit.message = message # Set parent to previous notes commit if exists try: parent_sha = self._refs[notes_ref] parent = self._object_store[parent_sha] if isinstance(parent, Commit): commit.parents = [parent_sha] except KeyError: commit.parents = [] self._object_store.add_object(commit) self._refs[notes_ref] = commit.id return commit.id def 
remove_note( self, object_sha: bytes, notes_ref: bytes | None = None, author: bytes | None = None, committer: bytes | None = None, message: bytes | None = None, config: "StackedConfig | None" = None, ) -> bytes | None: """Remove a note for an object. Args: object_sha: SHA of the object to remove notes from notes_ref: The notes ref to use, or None to use the default author: Author identity (defaults to committer) committer: Committer identity (defaults to config) message: Commit message for the notes removal config: Config to read user identity and notes.displayRef from Returns: SHA of the new notes commit, or None if no note existed """ import time from .objects import Commit from .repo import get_user_identity notes_ref = self.get_notes_ref(notes_ref, config) # Get current notes tree try: notes_commit_sha = self._refs[notes_ref] notes_obj = self._object_store[notes_commit_sha] # If it's a commit, get the tree from it if isinstance(notes_obj, Commit): notes_tree = self._object_store[notes_obj.tree] else: # If it's directly a tree (shouldn't happen in normal usage) notes_tree = notes_obj if not isinstance(notes_tree, Tree): return None except KeyError: return None # Remove from notes tree notes_tree_obj = NotesTree(notes_tree, self._object_store) new_tree = notes_tree_obj.remove_note(object_sha) if new_tree is None: return None # Create commit if committer is None and config is not None: committer = get_user_identity(config, kind="COMMITTER") if committer is None: committer = b"Git User " if author is None: author = committer if message is None: message = b"Notes removed by 'git notes remove'" commit = Commit() commit.tree = new_tree.id commit.author = author commit.committer = committer commit.commit_time = commit.author_time = int(time.time()) commit.commit_timezone = commit.author_timezone = 0 commit.encoding = b"UTF-8" commit.message = message # Set parent to previous notes commit parent_sha = self._refs[notes_ref] parent = self._object_store[parent_sha] if isinstance(parent, Commit): commit.parents = [parent_sha] self._object_store.add_object(commit) self._refs[notes_ref] = commit.id return commit.id def list_notes( self, notes_ref: bytes | None = None, config: "StackedConfig | None" = None, ) -> list[tuple[ObjectID, bytes]]: """List all notes in a notes ref. 
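        Example (illustrative sketch; assumes an existing repository in the
        current directory):

            from dulwich.repo import Repo

            r = Repo(".")
            notes = Notes(r.object_store, r.refs)
            for object_sha, content in notes.list_notes():
                print(object_sha, content)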
Args: notes_ref: The notes ref to use, or None to use the default config: Config to read notes.displayRef from Returns: List of tuples of (object_sha, note_content) """ notes_ref = self.get_notes_ref(notes_ref, config) try: notes_commit_sha = self._refs[notes_ref] except KeyError: return [] # Get the commit object from .objects import Commit notes_obj = self._object_store[notes_commit_sha] # If it's a commit, get the tree from it if isinstance(notes_obj, Commit): notes_tree = self._object_store[notes_obj.tree] else: # If it's directly a tree (shouldn't happen in normal usage) notes_tree = notes_obj if not isinstance(notes_tree, Tree): return [] notes_tree_obj = NotesTree(notes_tree, self._object_store) result: list[tuple[ObjectID, bytes]] = [] for object_sha, note_sha in notes_tree_obj.list_notes(): note_obj = self._object_store[note_sha] if isinstance(note_obj, Blob): result.append((object_sha, note_obj.data)) return result dulwich-1.0.0/dulwich/object_filters.py000066400000000000000000000516361513301442600201700ustar00rootroot00000000000000# object_filters.py -- Object filtering for partial clone and similar operations # Copyright (C) 2024 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Object filtering for Git partial clone and pack generation. This module implements Git's object filter specifications for partial clone, as documented in: https://git-scm.com/docs/rev-list-options#Documentation/rev-list-options.txt---filterltfilter-specgt Filter specifications control which objects are included when generating packs, enabling partial clone (downloading only needed objects) and similar operations. Supported filter specs: - blob:none - Exclude all blobs - blob:limit=[kmg] - Exclude blobs larger than n bytes/KB/MB/GB - tree: - Exclude trees beyond specified depth - sparse:oid= - Use sparse specification from object - combine:++... - Combine multiple filters """ __all__ = [ "BlobLimitFilter", "BlobNoneFilter", "CombineFilter", "FilterSpec", "SparseOidFilter", "TreeDepthFilter", "filter_pack_objects", "filter_pack_objects_with_paths", "parse_filter_spec", ] from abc import ABC, abstractmethod from typing import TYPE_CHECKING from .objects import S_ISGITLINK, Blob, Commit, ObjectID, Tag, Tree, valid_hexsha if TYPE_CHECKING: from collections.abc import Callable from .object_store import BaseObjectStore from .objects import ObjectID class FilterSpec(ABC): """Base class for all filter specifications.""" @abstractmethod def should_include_blob(self, blob_size: int) -> bool: """Determine if a blob of given size should be included. Args: blob_size: Size of the blob in bytes Returns: True if the blob should be included, False otherwise """ ... 
@abstractmethod def should_include_tree(self, depth: int) -> bool: """Determine if a tree at given depth should be included. Args: depth: Depth of the tree (0 = root) Returns: True if the tree should be included, False otherwise """ ... @abstractmethod def to_spec_string(self) -> str: """Convert filter spec back to string format. Returns: Filter specification string (e.g., ``blob:none``, ``blob:limit=1m``) """ ... class BlobNoneFilter(FilterSpec): """Filter that excludes all blobs.""" def should_include_blob(self, blob_size: int) -> bool: """Exclude all blobs.""" return False def should_include_tree(self, depth: int) -> bool: """Include all trees.""" return True def to_spec_string(self) -> str: """Return 'blob:none'.""" return "blob:none" def __repr__(self) -> str: """Return string representation of the filter.""" return "BlobNoneFilter()" class BlobLimitFilter(FilterSpec): """Filter that excludes blobs larger than a specified size.""" def __init__(self, limit: int) -> None: """Initialize blob limit filter. Args: limit: Maximum blob size in bytes """ self.limit = limit def should_include_blob(self, blob_size: int) -> bool: """Include only blobs smaller than or equal to the limit.""" return blob_size <= self.limit def should_include_tree(self, depth: int) -> bool: """Include all trees.""" return True def to_spec_string(self) -> str: """Return 'blob:limit=' with appropriate unit.""" size = self.limit if size >= 1024 * 1024 * 1024 and size % (1024 * 1024 * 1024) == 0: return f"blob:limit={size // (1024 * 1024 * 1024)}g" elif size >= 1024 * 1024 and size % (1024 * 1024) == 0: return f"blob:limit={size // (1024 * 1024)}m" elif size >= 1024 and size % 1024 == 0: return f"blob:limit={size // 1024}k" else: return f"blob:limit={size}" def __repr__(self) -> str: """Return string representation of the filter.""" return f"BlobLimitFilter(limit={self.limit})" class TreeDepthFilter(FilterSpec): """Filter that excludes trees beyond a specified depth.""" def __init__(self, max_depth: int) -> None: """Initialize tree depth filter. Args: max_depth: Maximum tree depth (0 = only root tree) """ self.max_depth = max_depth def should_include_blob(self, blob_size: int) -> bool: """Include all blobs.""" return True def should_include_tree(self, depth: int) -> bool: """Include only trees up to max_depth.""" return depth <= self.max_depth def to_spec_string(self) -> str: """Return 'tree:'.""" return f"tree:{self.max_depth}" def __repr__(self) -> str: """Return string representation of the filter.""" return f"TreeDepthFilter(max_depth={self.max_depth})" class SparseOidFilter(FilterSpec): """Filter that uses a sparse specification from an object. This filter reads sparse-checkout patterns from a blob object and uses them to determine which paths should be included in the partial clone. """ def __init__( self, oid: "ObjectID", object_store: "BaseObjectStore | None" = None ) -> None: """Initialize sparse OID filter. 
Args: oid: Object ID of the sparse specification blob object_store: Optional object store to load the sparse patterns from """ self.oid = oid self._patterns: list[tuple[str, bool, bool, bool]] | None = None self._object_store = object_store def _load_patterns(self) -> None: """Load and parse sparse patterns from the blob.""" if self._patterns is not None: return if self._object_store is None: raise ValueError("Cannot load sparse patterns without an object store") from .sparse_patterns import parse_sparse_patterns try: obj = self._object_store[self.oid] except KeyError: raise ValueError( f"Sparse specification blob {self.oid.hex() if isinstance(self.oid, bytes) else self.oid} not found" ) if not isinstance(obj, Blob): raise ValueError( f"Sparse specification {self.oid.hex() if isinstance(self.oid, bytes) else self.oid} is not a blob" ) # Parse the blob content as sparse patterns lines = obj.data.decode("utf-8").splitlines() self._patterns = parse_sparse_patterns(lines) def should_include_path(self, path: str) -> bool: """Determine if a path should be included based on sparse patterns. Args: path: Path to check (e.g., 'src/file.py') Returns: True if the path matches the sparse patterns, False otherwise """ self._load_patterns() from .sparse_patterns import match_sparse_patterns # Determine if path is a directory based on whether it ends with '/' path_is_dir = path.endswith("/") path_str = path.rstrip("/") assert self._patterns is not None # _load_patterns ensures this return match_sparse_patterns(path_str, self._patterns, path_is_dir=path_is_dir) def should_include_blob(self, blob_size: int) -> bool: """Include all blobs (sparse filtering is path-based, not size-based).""" return True def should_include_tree(self, depth: int) -> bool: """Include all trees (sparse filtering is path-based).""" return True def to_spec_string(self) -> str: """Return 'sparse:oid='.""" return f"sparse:oid={self.oid.decode('ascii') if isinstance(self.oid, bytes) else self.oid}" def __repr__(self) -> str: """Return string representation of the filter.""" oid_str = self.oid.decode("ascii") if isinstance(self.oid, bytes) else self.oid return f"SparseOidFilter(oid={oid_str!r})" class CombineFilter(FilterSpec): """Filter that combines multiple filters with AND logic.""" def __init__(self, filters: list[FilterSpec]) -> None: """Initialize combine filter. Args: filters: List of filters to combine """ self.filters = filters def should_include_blob(self, blob_size: int) -> bool: """Include blob only if all filters agree.""" return all(f.should_include_blob(blob_size) for f in self.filters) def should_include_tree(self, depth: int) -> bool: """Include tree only if all filters agree.""" return all(f.should_include_tree(depth) for f in self.filters) def to_spec_string(self) -> str: """Return 'combine:++...'.""" return "combine:" + "+".join(f.to_spec_string() for f in self.filters) def __repr__(self) -> str: """Return string representation of the filter.""" return f"CombineFilter(filters={self.filters!r})" def _parse_size(size_str: str) -> int: """Parse a size specification like '100', '10k', '5m', '1g'. 
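    Examples:
        >>> _parse_size("100")
        100
        >>> _parse_size("10k")
        10240
        >>> _parse_size("5m")
        5242880
        >>> _parse_size("1g")
        1073741824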
Args: size_str: Size string with optional unit suffix Returns: Size in bytes Raises: ValueError: If size_str is not a valid size specification """ size_str = size_str.lower() multipliers = {"k": 1024, "m": 1024 * 1024, "g": 1024 * 1024 * 1024} if size_str[-1] in multipliers: try: value = int(size_str[:-1]) return value * multipliers[size_str[-1]] except ValueError: raise ValueError(f"Invalid size specification: {size_str}") else: try: return int(size_str) except ValueError: raise ValueError(f"Invalid size specification: {size_str}") def parse_filter_spec( spec: str | bytes, object_store: "BaseObjectStore | None" = None ) -> FilterSpec: """Parse a filter specification string. Args: spec: Filter specification (e.g., 'blob:none', 'blob:limit=1m') object_store: Optional object store for loading sparse specifications Returns: Parsed FilterSpec object Raises: ValueError: If spec is not a valid filter specification Examples: >>> parse_filter_spec("blob:none") BlobNoneFilter() >>> parse_filter_spec("blob:limit=1m") BlobLimitFilter(limit=1048576) >>> parse_filter_spec("tree:0") TreeDepthFilter(max_depth=0) """ if isinstance(spec, bytes): try: spec = spec.decode("utf-8") except UnicodeDecodeError as e: raise ValueError(f"Filter specification must be valid UTF-8: {e}") spec = spec.strip() if not spec: raise ValueError("Filter specification cannot be empty") if spec == "blob:none": return BlobNoneFilter() elif spec.startswith("blob:limit="): limit_str = spec[11:] # len('blob:limit=') == 11 if not limit_str: raise ValueError("blob:limit requires a size value (e.g., blob:limit=1m)") try: limit = _parse_size(limit_str) if limit < 0: raise ValueError( f"blob:limit size must be non-negative, got {limit_str}" ) return BlobLimitFilter(limit) except ValueError as e: raise ValueError(f"Invalid blob:limit specification: {e}") elif spec.startswith("tree:"): depth_str = spec[5:] # len('tree:') == 5 if not depth_str: raise ValueError("tree filter requires a depth value (e.g., tree:0)") try: depth = int(depth_str) if depth < 0: raise ValueError(f"tree depth must be non-negative, got {depth}") return TreeDepthFilter(depth) except ValueError as e: raise ValueError(f"Invalid tree filter: {e}") elif spec.startswith("sparse:oid="): oid_str = spec[11:] # len('sparse:oid=') == 11 if not oid_str: raise ValueError( "sparse:oid requires an object ID (e.g., sparse:oid=abc123...)" ) # Validate OID format (should be 40 hex chars for SHA-1 or 64 for SHA-256) if not valid_hexsha(oid_str): raise ValueError( f"sparse:oid requires a valid object ID (40 or 64 hex chars), got {len(oid_str)} chars" ) oid: ObjectID = ObjectID(oid_str.encode("ascii")) return SparseOidFilter(oid, object_store=object_store) elif spec.startswith("combine:"): filter_str = spec[8:] # len('combine:') == 8 if not filter_str: raise ValueError( "combine filter requires at least one filter (e.g., combine:blob:none+tree:0)" ) filter_specs = filter_str.split("+") if len(filter_specs) < 2: raise ValueError( "combine filter requires at least two filters separated by '+'" ) try: filters = [ parse_filter_spec(f, object_store=object_store) for f in filter_specs ] except ValueError as e: raise ValueError(f"Invalid filter in combine specification: {e}") return CombineFilter(filters) else: # Provide helpful error message with supported formats raise ValueError( f"Unknown filter specification: '{spec}'. " f"Supported formats: blob:none, blob:limit=[kmg], tree:, " f"sparse:oid=, combine:++..." 
) def filter_pack_objects( object_store: "BaseObjectStore", object_ids: list["ObjectID"], filter_spec: FilterSpec, ) -> list["ObjectID"]: """Filter a list of object IDs based on a filter specification. This function examines each object and excludes those that don't match the filter criteria (e.g., blobs that are too large, trees beyond max depth). Args: object_store: Object store to retrieve objects from object_ids: List of object IDs to filter filter_spec: Filter specification to apply Returns: Filtered list of object IDs that should be included in the pack Note: This function currently supports blob size filtering. Tree depth filtering requires additional path/depth tracking which is not yet implemented. """ filtered_ids = [] for oid in object_ids: try: obj = object_store[oid] except KeyError: # Object not found, skip it continue # Determine object type and apply appropriate filter if isinstance(obj, Blob): # Check if blob should be included based on size blob_size = len(obj.data) if filter_spec.should_include_blob(blob_size): filtered_ids.append(oid) # else: blob is filtered out elif isinstance(obj, (Tree, Commit, Tag)): # For now, include all trees, commits, and tags # Tree depth filtering would require tracking depth during traversal # which needs to be implemented at the object collection stage if filter_spec.should_include_tree(0): # depth=0 for now filtered_ids.append(oid) else: # Unknown object type, include it to be safe filtered_ids.append(oid) return filtered_ids def filter_pack_objects_with_paths( object_store: "BaseObjectStore", wants: list["ObjectID"], filter_spec: FilterSpec, *, progress: "Callable[[bytes], None] | None" = None, ) -> list["ObjectID"]: """Filter objects for a pack with full path and depth tracking. This function performs a complete tree traversal starting from the wanted commits, tracking paths and depths to enable proper filtering for sparse:oid and tree: filters. 
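For example, a filter that rejects every blob (such as a parsed 'blob:none' spec) yields only the reachable commits, trees, and tags, while a 'sparse:oid=...' filter additionally drops blobs whose paths fall outside the sparse patterns.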
Args: object_store: Object store to retrieve objects from wants: List of commit/tree/blob IDs that are wanted filter_spec: Filter specification to apply progress: Optional progress callback Returns: Filtered list of object IDs that should be included in the pack """ import stat included_objects: set[ObjectID] = set() # Track (oid, path, depth) tuples to process to_process: list[tuple[ObjectID, str, int]] = [] # Start with the wanted commits for want in wants: try: obj = object_store[want] except KeyError: continue if isinstance(obj, Commit): # Always include commits included_objects.add(want) # Add the root tree to process with depth 0 to_process.append((obj.tree, "", 0)) elif isinstance(obj, Tree): # Direct tree wants start at depth 0 to_process.append((want, "", 0)) elif isinstance(obj, Tag): # Always include tags included_objects.add(want) # Process the tagged object tagged_oid = obj.object[1] to_process.append((tagged_oid, "", 0)) elif isinstance(obj, Blob): # Direct blob wants - check size filter blob_size = len(obj.data) if filter_spec.should_include_blob(blob_size): included_objects.add(want) # Process trees and their contents processed_trees: set[ObjectID] = set() while to_process: oid, current_path, depth = to_process.pop() # Skip if already processed if oid in processed_trees: continue try: obj = object_store[oid] except KeyError: continue if isinstance(obj, Tree): # Check if this tree should be included based on depth if not filter_spec.should_include_tree(depth): continue # Include this tree included_objects.add(oid) processed_trees.add(oid) # Process tree entries for name, mode, entry_oid in obj.iteritems(): assert name is not None assert mode is not None assert entry_oid is not None # Skip gitlinks if S_ISGITLINK(mode): continue # Build full path if current_path: full_path = f"{current_path}/{name.decode('utf-8')}" else: full_path = name.decode("utf-8") if stat.S_ISDIR(mode): # It's a subdirectory - add to process list with increased depth to_process.append((entry_oid, full_path, depth + 1)) elif stat.S_ISREG(mode): # It's a blob - check filters try: blob = object_store[entry_oid] except KeyError: continue if not isinstance(blob, Blob): continue # Check filters blob_size = len(blob.data) # For non-path-based filters (size, blob:none), check directly if not filter_spec.should_include_blob(blob_size): continue # Check path filter for sparse:oid path_allowed = True if isinstance(filter_spec, SparseOidFilter): path_allowed = filter_spec.should_include_path(full_path) elif isinstance(filter_spec, CombineFilter): # Check path filters in combination for f in filter_spec.filters: if isinstance(f, SparseOidFilter): if not f.should_include_path(full_path): path_allowed = False break if not path_allowed: continue # Include this blob included_objects.add(entry_oid) elif isinstance(obj, Blob): # Standalone blob (shouldn't normally happen in tree traversal) blob_size = len(obj.data) if filter_spec.should_include_blob(blob_size): included_objects.add(oid) return list(included_objects) dulwich-1.0.0/dulwich/object_format.py000066400000000000000000000110661513301442600200010ustar00rootroot00000000000000# hash.py -- Object format abstraction layer for Git # Copyright (C) 2024 The Dulwich contributors # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. 
You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Object format abstraction for Git objects. This module provides an abstraction layer for different object formats used in Git repositories (SHA-1 and SHA-256). """ from collections.abc import Callable from hashlib import sha1, sha256 from typing import TYPE_CHECKING if TYPE_CHECKING: from _hashlib import HASH class ObjectFormat: """Object format (hash algorithm) used in Git.""" def __init__( self, name: str, type_num: int, oid_length: int, hex_length: int, hash_func: Callable[[], "HASH"], ) -> None: """Initialize an object format. Args: name: Name of the format (e.g., "sha1", "sha256") type_num: Format type number used in Git oid_length: Length of the binary object ID in bytes hex_length: Length of the hexadecimal object ID in characters hash_func: Hash function from hashlib """ self.name = name self.type_num = type_num self.oid_length = oid_length self.hex_length = hex_length self.hash_func = hash_func def __str__(self) -> str: """Return string representation.""" return self.name def __repr__(self) -> str: """Return repr.""" return f"ObjectFormat({self.name!r})" def new_hash(self) -> "HASH": """Create a new hash object.""" return self.hash_func() def hash_object(self, data: bytes) -> bytes: """Hash data and return the digest. Args: data: Data to hash Returns: Binary digest """ h = self.new_hash() h.update(data) return h.digest() def hash_object_hex(self, data: bytes) -> bytes: """Hash data and return the hexadecimal digest. Args: data: Data to hash Returns: Hexadecimal digest as bytes """ h = self.new_hash() h.update(data) return h.hexdigest().encode("ascii") # Define the supported object formats SHA1 = ObjectFormat("sha1", type_num=1, oid_length=20, hex_length=40, hash_func=sha1) SHA256 = ObjectFormat( "sha256", type_num=20, oid_length=32, hex_length=64, hash_func=sha256 ) # Map of format names to ObjectFormat instances OBJECT_FORMATS = { "sha1": SHA1, "sha256": SHA256, } # Map of format numbers to ObjectFormat instances OBJECT_FORMAT_TYPE_NUMS = { 1: SHA1, 2: SHA256, } # Default format for backward compatibility DEFAULT_OBJECT_FORMAT = SHA1 def get_object_format(name: str | None = None) -> ObjectFormat: """Get an object format by name. Args: name: Format name ("sha1" or "sha256"). If None, returns default. Returns: ObjectFormat instance Raises: ValueError: If the format name is not supported """ if name is None: return DEFAULT_OBJECT_FORMAT try: return OBJECT_FORMATS[name.lower()] except KeyError: raise ValueError(f"Unsupported object format: {name}") def verify_same_object_format(*formats: ObjectFormat) -> ObjectFormat: """Verify that all provided object formats are the same. 
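For example, verify_same_object_format(SHA1, SHA1) returns SHA1, while verify_same_object_format(SHA1, SHA256) raises ValueError('Object format mismatch: sha1 != sha256').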
Args: *formats: Object format instances to verify Returns: The common object format Raises: ValueError: If formats don't match or no formats provided """ if not formats: raise ValueError("At least one object format must be provided") first = formats[0] for fmt in formats[1:]: if fmt != first: raise ValueError(f"Object format mismatch: {first.name} != {fmt.name}") return first dulwich-1.0.0/dulwich/object_store.py000066400000000000000000004056101513301442600176470ustar00rootroot00000000000000# object_store.py -- Object store for git objects # Copyright (C) 2008-2013 Jelmer Vernooij # and others # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. # """Git object store interfaces and implementation.""" __all__ = [ "DEFAULT_TEMPFILE_GRACE_PERIOD", "INFODIR", "PACKDIR", "PACK_MODE", "BaseObjectStore", "BitmapReachability", "BucketBasedObjectStore", "DiskObjectStore", "GraphTraversalReachability", "GraphWalker", "MemoryObjectStore", "MissingObjectFinder", "ObjectIterator", "ObjectReachabilityProvider", "ObjectStoreGraphWalker", "OverlayObjectStore", "PackBasedObjectStore", "PackCapableObjectStore", "PackContainer", "commit_tree_changes", "find_shallow", "get_depth", "iter_commit_contents", "iter_tree_contents", "peel_sha", "read_packs_file", "tree_lookup_path", ] import binascii import os import stat import sys import time import warnings from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence, Set from contextlib import suppress from io import BytesIO from pathlib import Path from typing import ( TYPE_CHECKING, BinaryIO, Protocol, cast, ) if TYPE_CHECKING: from .object_format import ObjectFormat from .errors import NotTreeError from .file import GitFile, _GitFile from .midx import MultiPackIndex, load_midx from .objects import ( S_ISGITLINK, Blob, Commit, ObjectID, RawObjectID, ShaFile, Tag, Tree, TreeEntry, hex_to_filename, hex_to_sha, object_class, sha_to_hex, valid_hexsha, ) from .pack import ( PACK_SPOOL_FILE_MAX_SIZE, ObjectContainer, Pack, PackData, PackedObjectContainer, PackFileDisappeared, PackHint, PackIndexer, PackInflater, PackStreamCopier, UnpackedObject, extend_pack, full_unpacked_object, generate_unpacked_objects, iter_sha1, load_pack_index_file, pack_objects_to_data, write_pack_data, write_pack_index, ) from .protocol import DEPTH_INFINITE, PEELED_TAG_SUFFIX from .refs import Ref if TYPE_CHECKING: from .bitmap import EWAHBitmap from .commit_graph import CommitGraph from .config import Config from .diff_tree import RenameDetector from .pack import Pack class GraphWalker(Protocol): """Protocol for graph walker objects.""" def __next__(self) -> ObjectID | None: """Return the next object SHA to visit.""" ... 
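# A typical negotiation loop (see BaseObjectStore.find_common_revisions below) calls next() repeatedly, ack()s every SHA the local store already has, and stops once the walker yields None; nak() signals that nothing in common was found.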
def ack(self, sha: ObjectID) -> None: """Acknowledge that an object has been received.""" ... def nak(self) -> None: """Nothing in common was found.""" ... class ObjectReachabilityProvider(Protocol): """Protocol for computing object reachability queries. This abstraction allows reachability computations to be backed by either naive graph traversal or optimized bitmap indexes, with a consistent interface. """ def get_reachable_commits( self, heads: Iterable[ObjectID], exclude: Iterable[ObjectID] | None = None, shallow: Set[ObjectID] | None = None, ) -> set[ObjectID]: """Get all commits reachable from heads, excluding those in exclude. Args: heads: Starting commit SHAs exclude: Commit SHAs to exclude (and their ancestors) shallow: Set of shallow commit boundaries (traversal stops here) Returns: Set of commit SHAs reachable from heads but not from exclude """ ... def get_reachable_objects( self, commits: Iterable[ObjectID], exclude_commits: Iterable[ObjectID] | None = None, ) -> set[ObjectID]: """Get all objects (commits + trees + blobs) reachable from commits. Args: commits: Starting commit SHAs exclude_commits: Commits whose objects should be excluded Returns: Set of all object SHAs (commits, trees, blobs, tags) """ ... def get_tree_objects( self, tree_shas: Iterable[ObjectID], ) -> set[ObjectID]: """Get all trees and blobs reachable from the given trees. Args: tree_shas: Starting tree SHAs Returns: Set of tree and blob SHAs """ ... INFODIR = "info" PACKDIR = "pack" # use permissions consistent with Git; just readable by everyone # TODO: should packs also be non-writable on Windows? if so, that # would requite some rather significant adjustments to the test suite PACK_MODE = 0o444 if sys.platform != "win32" else 0o644 # Grace period for cleaning up temporary pack files (in seconds) # Matches git's default of 2 weeks DEFAULT_TEMPFILE_GRACE_PERIOD = 14 * 24 * 60 * 60 # 2 weeks def find_shallow( store: ObjectContainer, heads: Iterable[ObjectID], depth: int ) -> tuple[set[ObjectID], set[ObjectID]]: """Find shallow commits according to a given depth. Args: store: An ObjectStore for looking up objects. heads: Iterable of head SHAs to start walking from. depth: The depth of ancestors to include. A depth of one includes only the heads themselves. Returns: A tuple of (shallow, not_shallow), sets of SHAs that should be considered shallow and unshallow according to the arguments. Note that these sets may overlap if a commit is reachable along multiple paths. 
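Example: for a linear history of commits c1 <- c2 <- c3 with heads=[c3], a depth of 1 yields ({c3}, set()), while a depth of 2 yields ({c2}, {c3}), i.e. c2 becomes the new shallow boundary.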
""" parents: dict[ObjectID, list[ObjectID]] = {} commit_graph = store.get_commit_graph() def get_parents(sha: ObjectID) -> list[ObjectID]: result = parents.get(sha, None) if not result: # Try to use commit graph first if available if commit_graph: graph_parents = commit_graph.get_parents(sha) if graph_parents is not None: result = graph_parents parents[sha] = result return result # Fall back to loading the object commit = store[sha] assert isinstance(commit, Commit) result = commit.parents parents[sha] = result return result todo = [] # stack of (sha, depth) for head_sha in heads: obj = store[head_sha] # Peel tags if necessary while isinstance(obj, Tag): _, sha = obj.object obj = store[sha] if isinstance(obj, Commit): todo.append((obj.id, 1)) not_shallow = set() shallow = set() while todo: sha, cur_depth = todo.pop() if cur_depth < depth: not_shallow.add(sha) new_depth = cur_depth + 1 todo.extend((p, new_depth) for p in get_parents(sha)) else: shallow.add(sha) return shallow, not_shallow def get_depth( store: ObjectContainer, head: ObjectID, get_parents: Callable[..., list[ObjectID]] = lambda commit: commit.parents, max_depth: int | None = None, ) -> int: """Return the current available depth for the given head. For commits with multiple parents, the largest possible depth will be returned. Args: store: Object store to search in head: commit to start from get_parents: optional function for getting the parents of a commit max_depth: maximum depth to search """ if head not in store: return 0 current_depth = 1 queue = [(head, current_depth)] commit_graph = store.get_commit_graph() while queue and (max_depth is None or current_depth < max_depth): e, depth = queue.pop(0) current_depth = max(current_depth, depth) # Try to use commit graph for parent lookup if available parents = None if commit_graph: parents = commit_graph.get_parents(e) if parents is None: # Fall back to loading the object cmt = store[e] if isinstance(cmt, Tag): _cls, sha = cmt.object cmt = store[sha] parents = get_parents(cmt) queue.extend((parent, depth + 1) for parent in parents if parent in store) return current_depth class PackContainer(Protocol): """Protocol for containers that can accept pack files.""" def add_pack(self) -> tuple[BytesIO, Callable[[], None], Callable[[], None]]: """Add a new pack.""" class BaseObjectStore: """Object store interface.""" def __init__(self, *, object_format: "ObjectFormat | None" = None) -> None: """Initialize object store. 
Args: object_format: Object format to use (defaults to DEFAULT_OBJECT_FORMAT) """ from .object_format import DEFAULT_OBJECT_FORMAT self.object_format = object_format if object_format else DEFAULT_OBJECT_FORMAT def determine_wants_all( self, refs: Mapping[Ref, ObjectID], depth: int | None = None ) -> list[ObjectID]: """Determine which objects are wanted based on refs.""" def _want_deepen(sha: ObjectID) -> bool: if not depth: return False if depth == DEPTH_INFINITE: return True return depth > self._get_depth(sha) return [ sha for (ref, sha) in refs.items() if (sha not in self or _want_deepen(sha)) and not ref.endswith(PEELED_TAG_SUFFIX) ] def contains_loose(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is loose.""" raise NotImplementedError(self.contains_loose) def contains_packed(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is packed.""" return False # Default implementation for stores that don't support packing def __contains__(self, sha1: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1. This method makes no distinction between loose and packed objects. """ return self.contains_loose(sha1) @property def packs(self) -> list[Pack]: """Iterable of pack objects.""" raise NotImplementedError def get_raw(self, name: RawObjectID | ObjectID) -> tuple[int, bytes]: """Obtain the raw text for an object. Args: name: sha for the object. Returns: tuple with numeric type and object contents. """ raise NotImplementedError(self.get_raw) def __getitem__(self, sha1: ObjectID | RawObjectID) -> ShaFile: """Obtain an object by SHA1.""" type_num, uncomp = self.get_raw(sha1) return ShaFile.from_raw_string( type_num, uncomp, sha=sha1, object_format=self.object_format ) def __iter__(self) -> Iterator[ObjectID]: """Iterate over the SHAs that are present in this store.""" raise NotImplementedError(self.__iter__) def add_object(self, obj: ShaFile) -> None: """Add a single object to this object store.""" raise NotImplementedError(self.add_object) def add_objects( self, objects: Sequence[tuple[ShaFile, str | None]], progress: Callable[..., None] | None = None, ) -> "Pack | None": """Add a set of objects to this object store. Args: objects: Iterable over a list of (object, path) tuples progress: Optional progress callback """ raise NotImplementedError(self.add_objects) def get_reachability_provider( self, prefer_bitmap: bool = True ) -> ObjectReachabilityProvider: """Get a reachability provider for this object store. Returns an ObjectReachabilityProvider that can efficiently compute object reachability queries. Subclasses can override this to provide optimized implementations (e.g., using bitmap indexes). Args: prefer_bitmap: Whether to prefer bitmap-based reachability if available. Returns: ObjectReachabilityProvider instance """ return GraphTraversalReachability(self) def tree_changes( self, source: ObjectID | None, target: ObjectID | None, want_unchanged: bool = False, include_trees: bool = False, change_type_same: bool = False, rename_detector: "RenameDetector | None" = None, paths: Sequence[bytes] | None = None, ) -> Iterator[ tuple[ tuple[bytes | None, bytes | None], tuple[int | None, int | None], tuple[ObjectID | None, ObjectID | None], ] ]: """Find the differences between the contents of two trees. 
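For example, comparing two trees that differ only in a modified regular file 'a.txt' yields a single tuple ((b'a.txt', b'a.txt'), (0o100644, 0o100644), (old_sha, new_sha)).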
Args: source: SHA1 of the source tree target: SHA1 of the target tree want_unchanged: Whether unchanged files should be reported include_trees: Whether to include trees change_type_same: Whether to report files changing type in the same entry. rename_detector: RenameDetector object for detecting renames. paths: Optional list of paths to filter to (as bytes). Returns: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) """ from .diff_tree import tree_changes for change in tree_changes( self, source, target, want_unchanged=want_unchanged, include_trees=include_trees, change_type_same=change_type_same, rename_detector=rename_detector, paths=paths, ): old_path = change.old.path if change.old is not None else None new_path = change.new.path if change.new is not None else None old_mode = change.old.mode if change.old is not None else None new_mode = change.new.mode if change.new is not None else None old_sha = change.old.sha if change.old is not None else None new_sha = change.new.sha if change.new is not None else None yield ( (old_path, new_path), (old_mode, new_mode), (old_sha, new_sha), ) def iter_tree_contents( self, tree_id: ObjectID, include_trees: bool = False ) -> Iterator[TreeEntry]: """Iterate the contents of a tree and all subtrees. Iteration is depth-first pre-order, as in e.g. os.walk. Args: tree_id: SHA1 of the tree. include_trees: If True, include tree objects in the iteration. Returns: Iterator over TreeEntry namedtuples for all the objects in a tree. """ warnings.warn( "Please use dulwich.object_store.iter_tree_contents", DeprecationWarning, stacklevel=2, ) return iter_tree_contents(self, tree_id, include_trees=include_trees) def iterobjects_subset( self, shas: Iterable[ObjectID], *, allow_missing: bool = False ) -> Iterator[ShaFile]: """Iterate over a subset of objects in the store. Args: shas: Iterable of object SHAs to retrieve allow_missing: If True, skip missing objects; if False, raise KeyError Returns: Iterator of ShaFile objects Raises: KeyError: If an object is missing and allow_missing is False """ for sha in shas: try: yield self[sha] except KeyError: if not allow_missing: raise def iter_unpacked_subset( self, shas: Iterable[ObjectID | RawObjectID], include_comp: bool = False, allow_missing: bool = False, convert_ofs_delta: bool = True, ) -> "Iterator[UnpackedObject]": """Iterate over unpacked objects for a subset of SHAs. Default implementation that converts ShaFile objects to UnpackedObject. Subclasses may override for more efficient unpacked access. 
Args: shas: Iterable of object SHAs to retrieve include_comp: Whether to include compressed data (ignored in base implementation) allow_missing: If True, skip missing objects; if False, raise KeyError convert_ofs_delta: Whether to convert OFS_DELTA objects (ignored in base implementation) Returns: Iterator of UnpackedObject instances Raises: KeyError: If an object is missing and allow_missing is False """ from .pack import UnpackedObject for sha in shas: try: obj = self[sha] # Convert ShaFile to UnpackedObject unpacked = UnpackedObject( obj.type_num, decomp_chunks=obj.as_raw_chunks(), sha=obj.id ) yield unpacked except KeyError: if not allow_missing: raise def find_missing_objects( self, haves: Iterable[ObjectID], wants: Iterable[ObjectID], shallow: Set[ObjectID] | None = None, progress: Callable[..., None] | None = None, get_tagged: Callable[[], dict[ObjectID, ObjectID]] | None = None, get_parents: Callable[..., list[ObjectID]] = lambda commit: commit.parents, ) -> Iterator[tuple[ObjectID, PackHint | None]]: """Find the missing objects required for a set of revisions. Args: haves: Iterable over SHAs already in common. wants: Iterable over SHAs of objects to fetch. shallow: Set of shallow commit SHA1s to skip progress: Simple progress function that will be called with updated progress strings. get_tagged: Function that returns a dict of pointed-to sha -> tag sha for including tags. get_parents: Optional function for getting the parents of a commit. Returns: Iterator over (sha, path) pairs. """ warnings.warn("Please use MissingObjectFinder(store)", DeprecationWarning) finder = MissingObjectFinder( self, haves=haves, wants=wants, shallow=shallow, progress=progress, get_tagged=get_tagged, get_parents=get_parents, ) return iter(finder) def find_common_revisions(self, graphwalker: GraphWalker) -> list[ObjectID]: """Find which revisions this store has in common using graphwalker. Args: graphwalker: A graphwalker object. Returns: List of SHAs that are in common """ haves = [] sha = next(graphwalker) while sha: if sha in self: haves.append(sha) graphwalker.ack(sha) sha = next(graphwalker) return haves def generate_pack_data( self, have: Iterable[ObjectID], want: Iterable[ObjectID], *, shallow: Set[ObjectID] | None = None, progress: Callable[..., None] | None = None, ofs_delta: bool = True, ) -> tuple[int, Iterator[UnpackedObject]]: """Generate pack data objects for a set of wants/haves. Args: have: List of SHA1s of objects that should not be sent want: List of SHA1s of objects that should be sent shallow: Set of shallow commit SHA1s to skip ofs_delta: Whether OFS deltas can be included progress: Optional progress reporting method """ # Note that the pack-specific implementation below is more efficient, # as it reuses deltas missing_objects = MissingObjectFinder( self, haves=have, wants=want, shallow=shallow, progress=progress ) object_ids = list(missing_objects) return pack_objects_to_data( [(self[oid], path) for oid, path in object_ids], ofs_delta=ofs_delta, progress=progress, ) def peel_sha(self, sha: ObjectID | RawObjectID) -> ObjectID: """Peel all tags from a SHA. Args: sha: The object SHA to peel. Returns: The fully-peeled SHA1 of a tag object, after peeling all intermediate tags; if the original ref does not point to a tag, this will equal the original SHA1. 
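Example: prefer the module-level helper, e.g. peel_sha(store, tag_sha)[1].id from dulwich.object_store, which this deprecated method wraps.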
""" warnings.warn( "Please use dulwich.object_store.peel_sha()", DeprecationWarning, stacklevel=2, ) return peel_sha(self, sha)[1].id def _get_depth( self, head: ObjectID, get_parents: Callable[..., list[ObjectID]] = lambda commit: commit.parents, max_depth: int | None = None, ) -> int: """Return the current available depth for the given head. For commits with multiple parents, the largest possible depth will be returned. Args: head: commit to start from get_parents: optional function for getting the parents of a commit max_depth: maximum depth to search """ return get_depth(self, head, get_parents=get_parents, max_depth=max_depth) def close(self) -> None: """Close any files opened by this object store.""" # Default implementation is a NO-OP def prune(self, grace_period: int | None = None) -> None: """Prune/clean up this object store. This includes removing orphaned temporary files and other housekeeping tasks. Default implementation is a NO-OP. Args: grace_period: Grace period in seconds for removing temporary files. If None, uses the default grace period. """ # Default implementation is a NO-OP def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]: """Iterate over all SHA1s that start with a given prefix. The default implementation is a naive iteration over all objects. However, subclasses may override this method with more efficient implementations. """ for sha in self: if sha.startswith(prefix): yield sha def get_commit_graph(self) -> "CommitGraph | None": """Get the commit graph for this object store. Returns: CommitGraph object if available, None otherwise """ return None def write_commit_graph( self, refs: Iterable[ObjectID] | None = None, reachable: bool = True ) -> None: """Write a commit graph file for this object store. Args: refs: List of refs to include. If None, includes all refs from object store. reachable: If True, includes all commits reachable from refs. If False, only includes the direct ref targets. Note: Default implementation does nothing. Subclasses should override this method to provide commit graph writing functionality. """ raise NotImplementedError(self.write_commit_graph) def get_object_mtime(self, sha: ObjectID) -> float: """Get the modification time of an object. Args: sha: SHA1 of the object Returns: Modification time as seconds since epoch Raises: KeyError: if the object is not found """ # Default implementation raises KeyError # Subclasses should override to provide actual mtime raise KeyError(sha) class PackCapableObjectStore(BaseObjectStore, PackedObjectContainer): """Object store that supports pack operations. This is a base class for object stores that can handle pack files, including both disk-based and memory-based stores. """ def add_pack(self) -> tuple[BinaryIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store. Returns: Tuple of (file, commit_func, abort_func) """ raise NotImplementedError(self.add_pack) def add_pack_data( self, count: int, unpacked_objects: Iterator["UnpackedObject"], progress: Callable[..., None] | None = None, ) -> "Pack | None": """Add pack data to this object store. Args: count: Number of objects unpacked_objects: Iterator over unpacked objects progress: Optional progress callback """ raise NotImplementedError(self.add_pack_data) def get_unpacked_object( self, sha1: ObjectID | RawObjectID, *, include_comp: bool = False ) -> "UnpackedObject": """Get a raw unresolved object. 
Args: sha1: SHA-1 hash of the object include_comp: Whether to include compressed data Returns: UnpackedObject instance """ from .pack import UnpackedObject obj = self[sha1] return UnpackedObject(obj.type_num, sha=sha1, decomp_chunks=obj.as_raw_chunks()) def iterobjects_subset( self, shas: Iterable[ObjectID], *, allow_missing: bool = False ) -> Iterator[ShaFile]: """Iterate over a subset of objects. Args: shas: Iterable of object SHAs to retrieve allow_missing: If True, skip missing objects Returns: Iterator of ShaFile objects """ for sha in shas: try: yield self[sha] except KeyError: if not allow_missing: raise class PackBasedObjectStore(PackCapableObjectStore, PackedObjectContainer): """Object store that uses pack files for storage. This class provides a base implementation for object stores that use Git pack files as their primary storage mechanism. It handles caching of open pack files and provides configuration for pack file operations. """ def __init__( self, pack_compression_level: int = -1, pack_index_version: int | None = None, pack_delta_window_size: int | None = None, pack_window_memory: int | None = None, pack_delta_cache_size: int | None = None, pack_depth: int | None = None, pack_threads: int | None = None, pack_big_file_threshold: int | None = None, *, object_format: "ObjectFormat | None" = None, ) -> None: """Initialize a PackBasedObjectStore. Args: pack_compression_level: Compression level for pack files (-1 to 9) pack_index_version: Pack index version to use pack_delta_window_size: Window size for delta compression pack_window_memory: Maximum memory to use for delta window pack_delta_cache_size: Cache size for delta operations pack_depth: Maximum depth for pack deltas pack_threads: Number of threads to use for packing pack_big_file_threshold: Threshold for treating files as "big" object_format: Hash algorithm to use """ super().__init__(object_format=object_format) self._pack_cache: dict[str, Pack] = {} self.pack_compression_level = pack_compression_level self.pack_index_version = pack_index_version self.pack_delta_window_size = pack_delta_window_size self.pack_window_memory = pack_window_memory self.pack_delta_cache_size = pack_delta_cache_size self.pack_depth = pack_depth self.pack_threads = pack_threads self.pack_big_file_threshold = pack_big_file_threshold def get_reachability_provider( self, prefer_bitmaps: bool = True, ) -> ObjectReachabilityProvider: """Get the best reachability provider for the object store. Args: prefer_bitmaps: Whether to use bitmaps if available Returns: ObjectReachabilityProvider implementation (either bitmap-accelerated or graph traversal) """ if prefer_bitmaps: # Check if any packs have bitmaps has_bitmap = False for pack in self.packs: try: # Try to access bitmap property if pack.bitmap is not None: has_bitmap = True break except FileNotFoundError: # Bitmap file doesn't exist for this pack continue if has_bitmap: return BitmapReachability(self) # Fall back to graph traversal return GraphTraversalReachability(self) def add_pack(self) -> tuple[BinaryIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store.""" raise NotImplementedError(self.add_pack) def add_pack_data( self, count: int, unpacked_objects: Iterator[UnpackedObject], progress: Callable[..., None] | None = None, ) -> "Pack | None": """Add pack data to this object store. 
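Objects are streamed into a new pack obtained from add_pack(); when count is 0, no pack file is written and None is returned.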
Args: count: Number of items to add unpacked_objects: Iterator of UnpackedObject instances progress: Optional progress callback """ if count == 0: # Don't bother writing an empty pack file return None f, commit, abort = self.add_pack() try: write_pack_data( f.write, unpacked_objects, num_records=count, progress=progress, compression_level=self.pack_compression_level, object_format=self.object_format, ) except BaseException: abort() raise else: return commit() @property def alternates(self) -> list["BaseObjectStore"]: """Return list of alternate object stores.""" return [] def contains_packed(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is packed. This does not check alternates. """ for pack in self.packs: try: if sha in pack: return True except PackFileDisappeared: pass return False def __contains__(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1. This method makes no distinction between loose and packed objects. """ if self.contains_packed(sha) or self.contains_loose(sha): return True for alternate in self.alternates: if sha in alternate: return True return False def _add_cached_pack(self, base_name: str, pack: Pack) -> None: """Add a newly appeared pack to the cache by path.""" prev_pack = self._pack_cache.get(base_name) if prev_pack is not pack: self._pack_cache[base_name] = pack if prev_pack: prev_pack.close() def generate_pack_data( self, have: Iterable[ObjectID], want: Iterable[ObjectID], *, shallow: Set[ObjectID] | None = None, progress: Callable[..., None] | None = None, ofs_delta: bool = True, ) -> tuple[int, Iterator[UnpackedObject]]: """Generate pack data objects for a set of wants/haves. Args: have: List of SHA1s of objects that should not be sent want: List of SHA1s of objects that should be sent shallow: Set of shallow commit SHA1s to skip ofs_delta: Whether OFS deltas can be included progress: Optional progress reporting method """ missing_objects = MissingObjectFinder( self, haves=have, wants=want, shallow=shallow, progress=progress ) remote_has = missing_objects.get_remote_has() object_ids = list(missing_objects) return len(object_ids), generate_unpacked_objects( self, object_ids, progress=progress, ofs_delta=ofs_delta, other_haves=remote_has, ) def _clear_cached_packs(self) -> None: pack_cache = self._pack_cache self._pack_cache = {} while pack_cache: (_name, pack) = pack_cache.popitem() pack.close() def _iter_cached_packs(self) -> Iterator[Pack]: return iter(self._pack_cache.values()) def _update_pack_cache(self) -> list[Pack]: raise NotImplementedError(self._update_pack_cache) def close(self) -> None: """Close the object store and release resources. This method closes all cached pack files and frees associated resources. Can be called multiple times safely. """ self._clear_cached_packs() def __del__(self) -> None: """Warn if the object store is being deleted with unclosed packs.""" if self._pack_cache: import warnings warnings.warn( f"ObjectStore {self!r} was destroyed with {len(self._pack_cache)} " "unclosed pack(s). Please call close() explicitly.", ResourceWarning, stacklevel=2, ) self.close() @property def packs(self) -> list[Pack]: """List with pack objects.""" return list(self._iter_cached_packs()) + list(self._update_pack_cache()) def count_pack_files(self) -> int: """Count the number of pack files. 
Returns: Number of pack files (excluding those with .keep files) """ count = 0 for pack in self.packs: # Check if there's a .keep file for this pack keep_path = pack._basename + ".keep" if not os.path.exists(keep_path): count += 1 return count def _iter_alternate_objects(self) -> Iterator[ObjectID]: """Iterate over the SHAs of all the objects in alternate stores.""" for alternate in self.alternates: yield from alternate def _iter_loose_objects(self) -> Iterator[ObjectID]: """Iterate over the SHAs of all loose objects.""" raise NotImplementedError(self._iter_loose_objects) def _get_loose_object(self, sha: ObjectID | RawObjectID) -> ShaFile | None: raise NotImplementedError(self._get_loose_object) def delete_loose_object(self, sha: ObjectID) -> None: """Delete a loose object. This method only handles loose objects. For packed objects, use repack(exclude=...) to exclude them during repacking. """ raise NotImplementedError(self.delete_loose_object) def _remove_pack(self, pack: "Pack") -> None: raise NotImplementedError(self._remove_pack) def pack_loose_objects(self, progress: Callable[[str], None] | None = None) -> int: """Pack loose objects. Args: progress: Optional progress reporting callback Returns: Number of objects packed """ objects: list[tuple[ShaFile, None]] = [] for sha in self._iter_loose_objects(): obj = self._get_loose_object(sha) if obj is not None: objects.append((obj, None)) self.add_objects(objects, progress=progress) for obj, path in objects: self.delete_loose_object(obj.id) return len(objects) def repack( self, exclude: Set[bytes] | None = None, progress: Callable[[str], None] | None = None, ) -> int: """Repack the packs in this repository. Note that this implementation is fairly naive and currently keeps all objects in memory while it repacks. Args: exclude: Optional set of object SHAs to exclude from repacking progress: Optional progress reporting callback """ if exclude is None: exclude = set() loose_objects = set() excluded_loose_objects = set() for sha in self._iter_loose_objects(): if sha not in exclude: obj = self._get_loose_object(sha) if obj is not None: loose_objects.add(obj) else: excluded_loose_objects.add(sha) objects: set[tuple[ShaFile, None]] = {(obj, None) for obj in loose_objects} old_packs = {p.name(): p for p in self.packs} for name, pack in old_packs.items(): objects.update( (obj, None) for obj in pack.iterobjects() if obj.id not in exclude ) # Only create a new pack if there are objects to pack if objects: # The name of the consolidated pack might match the name of a # pre-existing pack. Take care not to remove the newly created # consolidated pack. consolidated = self.add_objects(list(objects), progress=progress) if consolidated is not None: old_packs.pop(consolidated.name(), None) # Delete loose objects that were packed for obj in loose_objects: if obj is not None: self.delete_loose_object(obj.id) # Delete excluded loose objects for sha in excluded_loose_objects: self.delete_loose_object(sha) for name, pack in old_packs.items(): self._remove_pack(pack) self._update_pack_cache() return len(objects) def generate_pack_bitmaps( self, refs: dict[Ref, ObjectID], *, commit_interval: int | None = None, progress: Callable[[str], None] | None = None, ) -> int: """Generate bitmap indexes for all packs that don't have them. This generates .bitmap files for packfiles, enabling fast reachability queries. Equivalent to the bitmap generation part of 'git repack -b'. 
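For example, calling this with the mapping returned by the repository's get_refs() gives each pack a matching .bitmap file, which allows get_reachability_provider() to return the bitmap-backed provider.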
Args: refs: Dictionary of ref names to commit SHAs commit_interval: Include every Nth commit in bitmap index (None for default) progress: Optional progress reporting callback Returns: Number of bitmaps generated """ count = 0 for pack in self.packs: pack.ensure_bitmap( self, refs, commit_interval=commit_interval, progress=progress ) count += 1 # Update cache to pick up new bitmaps self._update_pack_cache() return count def __iter__(self) -> Iterator[ObjectID]: """Iterate over the SHAs that are present in this store.""" self._update_pack_cache() for pack in self._iter_cached_packs(): try: yield from pack except PackFileDisappeared: pass yield from self._iter_loose_objects() yield from self._iter_alternate_objects() def contains_loose(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is loose. This does not check alternates. """ return self._get_loose_object(sha) is not None def get_raw(self, name: RawObjectID | ObjectID) -> tuple[int, bytes]: """Obtain the raw fulltext for an object. Args: name: sha for the object. Returns: tuple with numeric type and object contents. """ sha: RawObjectID if len(name) == self.object_format.hex_length: sha = hex_to_sha(ObjectID(name)) hexsha = name elif len(name) == self.object_format.oid_length: sha = RawObjectID(name) hexsha = None else: raise AssertionError(f"Invalid object name {name!r}") for pack in self._iter_cached_packs(): try: return pack.get_raw(sha) except (KeyError, PackFileDisappeared): pass if hexsha is None: hexsha = sha_to_hex(sha) ret = self._get_loose_object(hexsha) if ret is not None: return ret.type_num, ret.as_raw_string() # Maybe something else has added a pack with the object # in the mean time? for pack in self._update_pack_cache(): try: return pack.get_raw(sha) except KeyError: pass for alternate in self.alternates: try: return alternate.get_raw(hexsha) except KeyError: pass raise KeyError(hexsha) def iter_unpacked_subset( self, shas: Iterable[ObjectID | RawObjectID], include_comp: bool = False, allow_missing: bool = False, convert_ofs_delta: bool = True, ) -> Iterator[UnpackedObject]: """Iterate over a subset of objects, yielding UnpackedObject instances. Args: shas: Set of object SHAs to retrieve include_comp: Whether to include compressed data allow_missing: If True, skip missing objects; if False, raise KeyError convert_ofs_delta: Whether to convert OFS_DELTA objects Returns: Iterator of UnpackedObject instances Raises: KeyError: If an object is missing and allow_missing is False """ todo: set[ObjectID | RawObjectID] = set(shas) for p in self._iter_cached_packs(): for unpacked in p.iter_unpacked_subset( todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta, ): yield unpacked hexsha = sha_to_hex(unpacked.sha()) todo.remove(hexsha) # Maybe something else has added a pack with the object # in the mean time? 
for p in self._update_pack_cache(): for unpacked in p.iter_unpacked_subset( todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta, ): yield unpacked hexsha = sha_to_hex(unpacked.sha()) todo.remove(hexsha) for alternate in self.alternates: assert isinstance(alternate, PackBasedObjectStore) for unpacked in alternate.iter_unpacked_subset( todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta, ): yield unpacked hexsha = sha_to_hex(unpacked.sha()) todo.remove(hexsha) def iterobjects_subset( self, shas: Iterable[ObjectID], *, allow_missing: bool = False ) -> Iterator[ShaFile]: """Iterate over a subset of objects in the store. This method searches for objects in pack files, alternates, and loose storage. Args: shas: Iterable of object SHAs to retrieve allow_missing: If True, skip missing objects; if False, raise KeyError Returns: Iterator of ShaFile objects Raises: KeyError: If an object is missing and allow_missing is False """ todo: set[ObjectID] = set(shas) for p in self._iter_cached_packs(): for o in p.iterobjects_subset(todo, allow_missing=True): yield o todo.remove(o.id) # Maybe something else has added a pack with the object # in the mean time? for p in self._update_pack_cache(): for o in p.iterobjects_subset(todo, allow_missing=True): yield o todo.remove(o.id) for alternate in self.alternates: for o in alternate.iterobjects_subset(todo, allow_missing=True): yield o todo.remove(o.id) for oid in todo: loose_obj: ShaFile | None = self._get_loose_object(oid) if loose_obj is not None: yield loose_obj elif not allow_missing: raise KeyError(oid) def get_unpacked_object( self, sha1: bytes, *, include_comp: bool = False ) -> UnpackedObject: """Obtain the unpacked object. Args: sha1: sha for the object. include_comp: Whether to include compression metadata. """ if len(sha1) == self.object_format.hex_length: sha = hex_to_sha(cast(ObjectID, sha1)) hexsha = cast(ObjectID, sha1) elif len(sha1) == self.object_format.oid_length: sha = cast(RawObjectID, sha1) hexsha = None else: raise AssertionError(f"Invalid object sha1 {sha1!r}") for pack in self._iter_cached_packs(): try: return pack.get_unpacked_object(sha, include_comp=include_comp) except (KeyError, PackFileDisappeared): pass if hexsha is None: hexsha = sha_to_hex(sha) # Maybe something else has added a pack with the object # in the mean time? for pack in self._update_pack_cache(): try: return pack.get_unpacked_object(sha, include_comp=include_comp) except KeyError: pass for alternate in self.alternates: assert isinstance(alternate, PackBasedObjectStore) try: return alternate.get_unpacked_object(hexsha, include_comp=include_comp) except KeyError: pass raise KeyError(hexsha) def add_objects( self, objects: Sequence[tuple[ShaFile, str | None]], progress: Callable[[str], None] | None = None, ) -> "Pack | None": """Add a set of objects to this object store. Args: objects: Iterable over (object, path) tuples, should support __len__. progress: Optional progress reporting function. Returns: Pack object of the objects written. 
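Example: given three ShaFile objects blob, tree and commit, store.add_objects([(blob, None), (tree, None), (commit, None)]) writes all three to a single new pack and returns it; an empty sequence returns None without creating a pack file.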
""" count = len(objects) record_iter = (full_unpacked_object(o) for (o, p) in objects) return self.add_pack_data(count, record_iter, progress=progress) class DiskObjectStore(PackBasedObjectStore): """Git-style object store that exists on disk.""" path: str | os.PathLike[str] pack_dir: str | os.PathLike[str] _alternates: "list[BaseObjectStore] | None" _commit_graph: "CommitGraph | None" def __init__( self, path: str | os.PathLike[str], *, loose_compression_level: int = -1, pack_compression_level: int = -1, pack_index_version: int | None = None, pack_delta_window_size: int | None = None, pack_window_memory: int | None = None, pack_delta_cache_size: int | None = None, pack_depth: int | None = None, pack_threads: int | None = None, pack_big_file_threshold: int | None = None, fsync_object_files: bool = False, pack_write_bitmaps: bool = False, pack_write_bitmap_hash_cache: bool = True, pack_write_bitmap_lookup_table: bool = True, file_mode: int | None = None, dir_mode: int | None = None, object_format: "ObjectFormat | None" = None, ) -> None: """Open an object store. Args: path: Path of the object store. loose_compression_level: zlib compression level for loose objects pack_compression_level: zlib compression level for pack objects pack_index_version: pack index version to use (1, 2, or 3) pack_delta_window_size: sliding window size for delta compression pack_window_memory: memory limit for delta window operations pack_delta_cache_size: size of cache for delta operations pack_depth: maximum delta chain depth pack_threads: number of threads for pack operations pack_big_file_threshold: threshold for treating files as big fsync_object_files: whether to fsync object files for durability pack_write_bitmaps: whether to write bitmap indexes for packs pack_write_bitmap_hash_cache: whether to include name-hash cache in bitmaps pack_write_bitmap_lookup_table: whether to include lookup table in bitmaps file_mode: File permission mask for shared repository dir_mode: Directory permission mask for shared repository object_format: Hash algorithm to use (SHA1 or SHA256) """ # Import here to avoid circular dependency from .object_format import DEFAULT_OBJECT_FORMAT super().__init__( pack_compression_level=pack_compression_level, pack_index_version=pack_index_version, pack_delta_window_size=pack_delta_window_size, pack_window_memory=pack_window_memory, pack_delta_cache_size=pack_delta_cache_size, pack_depth=pack_depth, pack_threads=pack_threads, pack_big_file_threshold=pack_big_file_threshold, object_format=object_format if object_format else DEFAULT_OBJECT_FORMAT, ) self.path = path self.pack_dir = os.path.join(self.path, PACKDIR) self._alternates = None self.loose_compression_level = loose_compression_level self.pack_compression_level = pack_compression_level self.pack_index_version = pack_index_version self.fsync_object_files = fsync_object_files self.pack_write_bitmaps = pack_write_bitmaps self.pack_write_bitmap_hash_cache = pack_write_bitmap_hash_cache self.pack_write_bitmap_lookup_table = pack_write_bitmap_lookup_table self.file_mode = file_mode self.dir_mode = dir_mode # Commit graph support - lazy loaded self._commit_graph = None self._use_commit_graph = True # Default to true # Multi-pack-index support - lazy loaded self._midx: MultiPackIndex | None = None self._use_midx = True # Default to true def __repr__(self) -> str: """Return string representation of DiskObjectStore. 
Returns: String representation including the store path """ return f"<{self.__class__.__name__}({self.path!r})>" @classmethod def from_config( cls, path: str | os.PathLike[str], config: "Config", *, file_mode: int | None = None, dir_mode: int | None = None, ) -> "DiskObjectStore": """Create a DiskObjectStore from a configuration object. Args: path: Path to the object store directory config: Configuration object to read settings from file_mode: Optional file permission mask for shared repository dir_mode: Optional directory permission mask for shared repository Returns: New DiskObjectStore instance configured according to config """ try: default_compression_level = int( config.get((b"core",), b"compression").decode() ) except KeyError: default_compression_level = -1 try: loose_compression_level = int( config.get((b"core",), b"looseCompression").decode() ) except KeyError: loose_compression_level = default_compression_level try: pack_compression_level = int( config.get((b"core",), "packCompression").decode() ) except KeyError: pack_compression_level = default_compression_level try: pack_index_version = int(config.get((b"pack",), b"indexVersion").decode()) except KeyError: pack_index_version = None # Read pack configuration options try: pack_delta_window_size = int( config.get((b"pack",), b"deltaWindowSize").decode() ) except KeyError: pack_delta_window_size = None try: pack_window_memory = int(config.get((b"pack",), b"windowMemory").decode()) except KeyError: pack_window_memory = None try: pack_delta_cache_size = int( config.get((b"pack",), b"deltaCacheSize").decode() ) except KeyError: pack_delta_cache_size = None try: pack_depth = int(config.get((b"pack",), b"depth").decode()) except KeyError: pack_depth = None try: pack_threads = int(config.get((b"pack",), b"threads").decode()) except KeyError: pack_threads = None try: pack_big_file_threshold = int( config.get((b"pack",), b"bigFileThreshold").decode() ) except KeyError: pack_big_file_threshold = None # Read core.commitGraph setting use_commit_graph = config.get_boolean((b"core",), b"commitGraph", True) # Read core.multiPackIndex setting use_midx = config.get_boolean((b"core",), b"multiPackIndex", True) # Read core.fsyncObjectFiles setting fsync_object_files = config.get_boolean((b"core",), b"fsyncObjectFiles", False) # Read bitmap settings pack_write_bitmaps = config.get_boolean((b"pack",), b"writeBitmaps", False) pack_write_bitmap_hash_cache = config.get_boolean( (b"pack",), b"writeBitmapHashCache", True ) pack_write_bitmap_lookup_table = config.get_boolean( (b"pack",), b"writeBitmapLookupTable", True ) # Also check repack.writeBitmaps for backwards compatibility if not pack_write_bitmaps: pack_write_bitmaps = config.get_boolean( (b"repack",), b"writeBitmaps", False ) # Get hash algorithm from config from .object_format import get_object_format object_format = None try: try: version = int(config.get((b"core",), b"repositoryformatversion")) except KeyError: version = 0 if version == 1: try: object_format_name = config.get((b"extensions",), b"objectformat") except KeyError: object_format_name = b"sha1" object_format = get_object_format(object_format_name.decode("ascii")) except (KeyError, ValueError): pass instance = cls( path, loose_compression_level=loose_compression_level, pack_compression_level=pack_compression_level, pack_index_version=pack_index_version, pack_delta_window_size=pack_delta_window_size, pack_window_memory=pack_window_memory, pack_delta_cache_size=pack_delta_cache_size, pack_depth=pack_depth, pack_threads=pack_threads, 
pack_big_file_threshold=pack_big_file_threshold, fsync_object_files=fsync_object_files, pack_write_bitmaps=pack_write_bitmaps, pack_write_bitmap_hash_cache=pack_write_bitmap_hash_cache, pack_write_bitmap_lookup_table=pack_write_bitmap_lookup_table, file_mode=file_mode, dir_mode=dir_mode, object_format=object_format, ) instance._use_commit_graph = use_commit_graph instance._use_midx = use_midx return instance @property def alternates(self) -> list["BaseObjectStore"]: """Get the list of alternate object stores. Reads from .git/objects/info/alternates if not already cached. Returns: List of DiskObjectStore instances for alternate object directories """ if self._alternates is not None: return self._alternates self._alternates = [] for path in self._read_alternate_paths(): self._alternates.append(DiskObjectStore(path)) return self._alternates def _read_alternate_paths(self) -> Iterator[str]: try: f = GitFile(os.path.join(self.path, INFODIR, "alternates"), "rb") except FileNotFoundError: return with f: for line in f.readlines(): line = line.rstrip(b"\n") if line.startswith(b"#"): continue if os.path.isabs(line): yield os.fsdecode(line) else: yield os.fsdecode(os.path.join(os.fsencode(self.path), line)) def add_alternate_path(self, path: str | os.PathLike[str]) -> None: """Add an alternate path to this object store.""" info_dir = os.path.join(self.path, INFODIR) try: os.mkdir(info_dir) if self.dir_mode is not None: os.chmod(info_dir, self.dir_mode) except FileExistsError: pass alternates_path = os.path.join(self.path, INFODIR, "alternates") mask = self.file_mode if self.file_mode is not None else 0o644 with GitFile(alternates_path, "wb", mask=mask) as f: try: orig_f = open(alternates_path, "rb") except FileNotFoundError: pass else: with orig_f: f.write(orig_f.read()) f.write(os.fsencode(path) + b"\n") if not os.path.isabs(path): path = os.path.join(self.path, path) self.alternates.append(DiskObjectStore(path)) def _update_pack_cache(self) -> list[Pack]: """Read and iterate over new pack files and cache them.""" try: pack_dir_contents = os.listdir(self.pack_dir) except FileNotFoundError: return [] pack_files = set() for name in pack_dir_contents: if name.startswith("pack-") and name.endswith(".pack"): # verify that idx exists first (otherwise the pack was not yet # fully written) idx_name = os.path.splitext(name)[0] + ".idx" if idx_name in pack_dir_contents: # Extract just the hash (remove "pack-" prefix and ".pack" suffix) pack_hash = name[len("pack-") : -len(".pack")] pack_files.add(pack_hash) # Open newly appeared pack files new_packs = [] for pack_hash in pack_files: if pack_hash not in self._pack_cache: pack = Pack( os.path.join(self.pack_dir, "pack-" + pack_hash), object_format=self.object_format, delta_window_size=self.pack_delta_window_size, window_memory=self.pack_window_memory, delta_cache_size=self.pack_delta_cache_size, depth=self.pack_depth, threads=self.pack_threads, big_file_threshold=self.pack_big_file_threshold, ) new_packs.append(pack) self._pack_cache[pack_hash] = pack # Remove disappeared pack files for f in set(self._pack_cache) - pack_files: self._pack_cache.pop(f).close() return new_packs def _get_shafile_path(self, sha: ObjectID | RawObjectID) -> str: # Check from object dir return hex_to_filename(os.fspath(self.path), sha) def _iter_loose_objects(self) -> Iterator[ObjectID]: for base in os.listdir(self.path): if len(base) != 2: continue for rest in os.listdir(os.path.join(self.path, base)): sha = os.fsencode(base + rest) if not valid_hexsha(sha): continue yield 
ObjectID(sha) def count_loose_objects(self) -> int: """Count the number of loose objects in the object store. Returns: Number of loose objects """ # Calculate expected filename length for loose # objects (excluding directory) fn_length = self.object_format.hex_length - 2 count = 0 if not os.path.exists(self.path): return 0 for i in range(256): subdir = os.path.join(self.path, f"{i:02x}") try: count += len( [name for name in os.listdir(subdir) if len(name) == fn_length] ) except FileNotFoundError: # Directory may have been removed or is inaccessible continue return count def _get_loose_object(self, sha: ObjectID | RawObjectID) -> ShaFile | None: path = self._get_shafile_path(sha) try: # Load the object from path with SHA and hash algorithm from object store # Convert to hex ObjectID if needed if len(sha) == self.object_format.oid_length: hex_sha: ObjectID = sha_to_hex(RawObjectID(sha)) else: hex_sha = ObjectID(sha) return ShaFile.from_path(path, hex_sha, object_format=self.object_format) except FileNotFoundError: return None def delete_loose_object(self, sha: ObjectID) -> None: """Delete a loose object from disk. Args: sha: SHA1 of the object to delete Raises: FileNotFoundError: If the object file doesn't exist """ os.remove(self._get_shafile_path(sha)) def get_object_mtime(self, sha: ObjectID) -> float: """Get the modification time of an object. Args: sha: SHA1 of the object Returns: Modification time as seconds since epoch Raises: KeyError: if the object is not found """ # First check if it's a loose object if self.contains_loose(sha): path = self._get_shafile_path(sha) try: return os.path.getmtime(path) except FileNotFoundError: pass # Check if it's in a pack file for pack in self.packs: try: if sha in pack: # Use the pack file's mtime for packed objects pack_path = pack._data_path try: return os.path.getmtime(pack_path) except (FileNotFoundError, AttributeError): pass except PackFileDisappeared: pass raise KeyError(sha) def _remove_pack(self, pack: Pack) -> None: try: del self._pack_cache[os.path.basename(pack._basename)] except KeyError: pass # Store paths before closing to avoid re-opening files on Windows data_path = pack._data_path idx_path = pack._idx_path pack.close() os.remove(data_path) if os.path.exists(idx_path): os.remove(idx_path) def _get_pack_basepath( self, entries: Iterable[tuple[bytes, int, int | None]] ) -> str: suffix_bytes = iter_sha1(entry[0] for entry in entries) # TODO: Handle self.pack_dir being bytes suffix = suffix_bytes.decode("ascii") return os.path.join(self.pack_dir, "pack-" + suffix) def _complete_pack( self, f: BinaryIO, path: str, num_objects: int, indexer: PackIndexer, progress: Callable[..., None] | None = None, refs: dict[Ref, ObjectID] | None = None, ) -> Pack: """Move a specific file containing a pack into the pack directory. Note: The file should be on the same file system as the packs directory. Args: f: Open file object for the pack. path: Path to the pack file. num_objects: Number of objects in the pack. indexer: A PackIndexer for indexing the pack. progress: Optional progress reporting function. refs: Optional dictionary of refs for bitmap generation. 
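Returns: The completed Pack object, after the pack file has been renamed into the pack directory and its index (and, when bitmap writing is enabled and refs are given, its .bitmap) has been written; if a pack with the same base name already exists, that existing Pack is returned instead.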
""" entries = [] for i, entry in enumerate(indexer): if progress is not None: progress(f"generating index: {i}/{num_objects}\r".encode("ascii")) entries.append(entry) pack_sha, extra_entries = extend_pack( f, set(indexer.ext_refs()), get_raw=self.get_raw, compression_level=self.pack_compression_level, progress=progress, object_format=self.object_format, ) f.flush() if self.fsync_object_files: try: fileno = f.fileno() except AttributeError as e: raise OSError("fsync requested but file has no fileno()") from e else: os.fsync(fileno) f.close() entries.extend(extra_entries) # Move the pack in. entries.sort() pack_base_name = self._get_pack_basepath(entries) for pack in self.packs: if pack._basename == pack_base_name: return pack target_pack_path = pack_base_name + ".pack" target_index_path = pack_base_name + ".idx" if sys.platform == "win32": # Windows might have the target pack file lingering. Attempt # removal, silently passing if the target does not exist. with suppress(FileNotFoundError): os.remove(target_pack_path) os.rename(path, target_pack_path) # Write the index. mask = self.file_mode if self.file_mode is not None else PACK_MODE with GitFile( target_index_path, "wb", mask=mask, fsync=self.fsync_object_files, ) as index_file: write_pack_index( index_file, entries, pack_sha, version=self.pack_index_version ) # Generate bitmap if configured and refs are available if self.pack_write_bitmaps and refs: from .bitmap import generate_bitmap, write_bitmap from .pack import load_pack_index_file if progress: progress("Generating bitmap index\r".encode("ascii")) # Load the index we just wrote with open(target_index_path, "rb") as idx_file: pack_index = load_pack_index_file( os.path.basename(target_index_path), idx_file, self.object_format, ) # Generate the bitmap bitmap = generate_bitmap( pack_index=pack_index, object_store=self, refs=refs, pack_checksum=pack_sha, include_hash_cache=self.pack_write_bitmap_hash_cache, include_lookup_table=self.pack_write_bitmap_lookup_table, progress=lambda msg: progress(msg.encode("ascii")) if progress and isinstance(msg, str) else None, ) # Write the bitmap target_bitmap_path = pack_base_name + ".bitmap" write_bitmap(target_bitmap_path, bitmap) if progress: progress("Bitmap index written\r".encode("ascii")) # Add the pack to the store and return it. final_pack = Pack( pack_base_name, object_format=self.object_format, delta_window_size=self.pack_delta_window_size, window_memory=self.pack_window_memory, delta_cache_size=self.pack_delta_cache_size, depth=self.pack_depth, threads=self.pack_threads, big_file_threshold=self.pack_big_file_threshold, ) final_pack.check_length_and_checksum() # Extract just the hash from pack_base_name (/path/to/pack-HASH -> HASH) pack_hash = os.path.basename(pack_base_name)[len("pack-") :] self._add_cached_pack(pack_hash, final_pack) return final_pack def add_thin_pack( self, read_all: Callable[[int], bytes], read_some: Callable[[int], bytes] | None, progress: Callable[..., None] | None = None, ) -> "Pack": """Add a new thin pack to this object store. Thin packs are packs that contain deltas with parents that exist outside the pack. They should never be placed in the object store directly, and always indexed and completed as they are copied. Args: read_all: Read function that blocks until the number of requested bytes are read. read_some: Read function that returns at least one byte, but may not return the number of bytes requested. progress: Optional progress reporting function. 
Returns: A Pack object pointing at the now-completed thin pack in the objects/pack directory. """ import tempfile fd, path = tempfile.mkstemp(dir=self.path, prefix="tmp_pack_") with os.fdopen(fd, "w+b") as f: os.chmod(path, PACK_MODE) indexer = PackIndexer( f, self.object_format.hash_func, resolve_ext_ref=self.get_raw, # type: ignore[arg-type] ) copier = PackStreamCopier( self.object_format.hash_func, read_all, read_some, f, delta_iter=indexer, # type: ignore[arg-type] ) copier.verify(progress=progress) return self._complete_pack(f, path, len(copier), indexer, progress=progress) def add_pack( self, ) -> tuple[BinaryIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store. Returns: Fileobject to write to, a commit function to call when the pack is finished and an abort function. """ import tempfile fd, path = tempfile.mkstemp(dir=self.pack_dir, suffix=".pack") f = os.fdopen(fd, "w+b") mask = self.file_mode if self.file_mode is not None else PACK_MODE os.chmod(path, mask) def commit() -> "Pack | None": if f.tell() > 0: f.seek(0) with PackData(path, file=f, object_format=self.object_format) as pd: indexer = PackIndexer.for_pack_data( pd, resolve_ext_ref=self.get_raw, # type: ignore[arg-type] ) return self._complete_pack(f, path, len(pd), indexer) # type: ignore[arg-type] else: f.close() os.remove(path) return None def abort() -> None: f.close() os.remove(path) return f, commit, abort # type: ignore[return-value] def add_object(self, obj: ShaFile) -> None: """Add a single object to this object store. Args: obj: Object to add """ # Use the correct hash algorithm for the object ID obj_id = ObjectID(obj.get_id(self.object_format)) path = self._get_shafile_path(obj_id) dir = os.path.dirname(path) try: os.mkdir(dir) if self.dir_mode is not None: os.chmod(dir, self.dir_mode) except FileExistsError: pass if os.path.exists(path): return # Already there, no need to write again mask = self.file_mode if self.file_mode is not None else PACK_MODE with GitFile(path, "wb", mask=mask, fsync=self.fsync_object_files) as f: f.write( obj.as_legacy_object(compression_level=self.loose_compression_level) ) @classmethod def init( cls, path: str | os.PathLike[str], *, file_mode: int | None = None, dir_mode: int | None = None, object_format: "ObjectFormat | None" = None, ) -> "DiskObjectStore": """Initialize a new disk object store. Creates the necessary directory structure for a Git object store. Args: path: Path where the object store should be created file_mode: Optional file permission mask for shared repository dir_mode: Optional directory permission mask for shared repository object_format: Hash algorithm to use (SHA1 or SHA256) Returns: New DiskObjectStore instance """ try: os.mkdir(path) if dir_mode is not None: os.chmod(path, dir_mode) except FileExistsError: pass info_path = os.path.join(path, "info") pack_path = os.path.join(path, PACKDIR) os.mkdir(info_path) os.mkdir(pack_path) if dir_mode is not None: os.chmod(info_path, dir_mode) os.chmod(pack_path, dir_mode) return cls( path, file_mode=file_mode, dir_mode=dir_mode, object_format=object_format ) def iter_prefix(self, prefix: bytes) -> Iterator[ObjectID]: """Iterate over all object SHAs with the given prefix. 
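        For example (illustrative), ``store.iter_prefix(b"1a2b")`` yields the
        hex SHA of every loose, packed, or alternate object whose id starts
        with ``1a2b``.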
Args: prefix: Hex prefix to search for (as bytes) Returns: Iterator of object SHAs (as ObjectID) matching the prefix """ if len(prefix) < 2: yield from super().iter_prefix(prefix) return seen = set() dir = prefix[:2].decode() rest = prefix[2:].decode() try: for name in os.listdir(os.path.join(self.path, dir)): if name.startswith(rest): sha = ObjectID(os.fsencode(dir + name)) if sha not in seen: seen.add(sha) yield sha except FileNotFoundError: pass for p in self.packs: bin_prefix = ( binascii.unhexlify(prefix) if len(prefix) % 2 == 0 else binascii.unhexlify(prefix[:-1]) ) for bin_sha in p.index.iter_prefix(bin_prefix): sha = sha_to_hex(bin_sha) if sha.startswith(prefix) and sha not in seen: seen.add(sha) yield sha for alternate in self.alternates: for sha in alternate.iter_prefix(prefix): if sha not in seen: seen.add(sha) yield sha def get_commit_graph(self) -> "CommitGraph | None": """Get the commit graph for this object store. Returns: CommitGraph object if available, None otherwise """ if not self._use_commit_graph: return None if self._commit_graph is None: from .commit_graph import read_commit_graph # Look for commit graph in our objects directory graph_file = os.path.join(self.path, "info", "commit-graph") if os.path.exists(graph_file): self._commit_graph = read_commit_graph(graph_file) return self._commit_graph def get_midx(self) -> MultiPackIndex | None: """Get the multi-pack-index for this object store. Returns: MultiPackIndex object if available, None otherwise Raises: ValueError: If MIDX file is corrupt OSError: If MIDX file cannot be read """ if not self._use_midx: return None if self._midx is None: # Look for MIDX in pack directory midx_file = os.path.join(self.pack_dir, "multi-pack-index") if os.path.exists(midx_file): self._midx = load_midx(midx_file) return self._midx def _get_pack_by_name(self, pack_name: str) -> Pack: """Get a pack by its base name. Args: pack_name: Base name of the pack (e.g., 'pack-abc123.pack' or 'pack-abc123.idx') Returns: Pack object Raises: KeyError: If pack doesn't exist """ # Remove .pack or .idx extension if present if pack_name.endswith(".pack"): base_name = pack_name[:-5] elif pack_name.endswith(".idx"): base_name = pack_name[:-4] else: base_name = pack_name # Check if already in cache if base_name in self._pack_cache: return self._pack_cache[base_name] # Load the pack pack_path = os.path.join(self.pack_dir, base_name) if not os.path.exists(pack_path + ".pack"): raise KeyError(f"Pack {pack_name} not found") pack = Pack( pack_path, object_format=self.object_format, delta_window_size=self.pack_delta_window_size, window_memory=self.pack_window_memory, delta_cache_size=self.pack_delta_cache_size, depth=self.pack_depth, threads=self.pack_threads, big_file_threshold=self.pack_big_file_threshold, ) self._pack_cache[base_name] = pack return pack def contains_packed(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is packed. This checks the MIDX first if available, then falls back to checking individual pack indexes. Args: sha: Binary SHA of the object Returns: True if the object is in a pack file """ # Check MIDX first for faster lookup midx = self.get_midx() if midx is not None and sha in midx: return True # Fall back to checking individual packs return super().contains_packed(sha) def get_raw(self, name: RawObjectID | ObjectID) -> tuple[int, bytes]: """Obtain the raw fulltext for an object. This uses the MIDX if available for faster lookups. 
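        For example (illustrative), ``store.get_raw(commit_sha)`` returns a
        ``(type_num, data)`` tuple and accepts either the hex or the binary
        form of ``commit_sha``.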
Args: name: SHA for the object (20 bytes binary or 40 bytes hex) Returns: Tuple with numeric type and object contents Raises: KeyError: If object not found """ sha: RawObjectID if len(name) in (40, 64): # name is ObjectID (hex), convert to RawObjectID # Support both SHA1 (40) and SHA256 (64) sha = hex_to_sha(cast(ObjectID, name)) elif len(name) in (20, 32): # name is already RawObjectID (binary) # Support both SHA1 (20) and SHA256 (32) sha = RawObjectID(name) else: raise AssertionError(f"Invalid object name {name!r}") # Try MIDX first for faster lookup midx = self.get_midx() if midx is not None: result = midx.object_offset(sha) if result is not None: pack_name, _offset = result try: pack = self._get_pack_by_name(pack_name) return pack.get_raw(sha) except (KeyError, PackFileDisappeared): # Pack disappeared or object not found, fall through to standard lookup pass # Fall back to the standard implementation return super().get_raw(name) def write_midx(self) -> bytes: """Write a multi-pack-index file for this object store. Creates a MIDX file that indexes all pack files in the pack directory. Returns: SHA-1 checksum of the written MIDX file Raises: OSError: If the pack directory doesn't exist or MIDX can't be written """ from .midx import write_midx_file # Get all pack files packs = self.packs if not packs: # No packs to index return b"\x00" * 20 # Collect entries from all packs pack_entries: list[tuple[str, list[tuple[RawObjectID, int, int | None]]]] = [] for pack in packs: # Git stores .idx extension in MIDX, not .pack pack_name = os.path.basename(pack._basename) + ".idx" entries = list(pack.index.iterentries()) pack_entries.append((pack_name, entries)) # Write MIDX file midx_path = os.path.join(self.pack_dir, "multi-pack-index") return write_midx_file(midx_path, pack_entries) def write_commit_graph( self, refs: Iterable[ObjectID] | None = None, reachable: bool = True ) -> None: """Write a commit graph file for this object store. Args: refs: List of refs to include. If None, includes all refs from object store. reachable: If True, includes all commits reachable from refs. If False, only includes the direct ref targets. 
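        Example (illustrative sketch; ``head_sha`` is an assumed commit id):

            store.write_commit_graph()            # index every commit in the store
            store.write_commit_graph([head_sha])  # only commits reachable from head_sha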
""" from .commit_graph import get_reachable_commits if refs is None: # Get all commit objects from the object store all_refs = [] # Iterate through all objects to find commits for sha in self: try: obj = self[sha] if obj.type_name == b"commit": all_refs.append(sha) except KeyError: continue else: # Use provided refs all_refs = list(refs) if not all_refs: return # No commits to include if reachable: # Get all reachable commits commit_ids = get_reachable_commits(self, all_refs) else: # Just use the direct ref targets - ensure they're hex ObjectIDs commit_ids = [] for ref in all_refs: if isinstance(ref, bytes) and len(ref) == self.object_format.hex_length: # Already hex ObjectID commit_ids.append(ref) elif ( isinstance(ref, bytes) and len(ref) == self.object_format.oid_length ): # Binary SHA, convert to hex ObjectID from .objects import sha_to_hex commit_ids.append(sha_to_hex(RawObjectID(ref))) else: # Assume it's already correct format commit_ids.append(ref) if commit_ids: # Write commit graph directly to our object store path # Generate the commit graph from .commit_graph import generate_commit_graph graph = generate_commit_graph(self, commit_ids) if graph.entries: # Ensure the info directory exists info_dir = os.path.join(self.path, "info") os.makedirs(info_dir, exist_ok=True) if self.dir_mode is not None: os.chmod(info_dir, self.dir_mode) # Write using GitFile for atomic operation graph_path = os.path.join(info_dir, "commit-graph") mask = self.file_mode if self.file_mode is not None else 0o644 with GitFile(graph_path, "wb", mask=mask) as f: assert isinstance( f, _GitFile ) # GitFile in write mode always returns _GitFile graph.write_to_file(f) # Clear cached commit graph so it gets reloaded self._commit_graph = None def prune(self, grace_period: int | None = None) -> None: """Prune/clean up this object store. This removes temporary files that were left behind by interrupted pack operations. These are files that start with ``tmp_pack_`` in the repository directory or files with .pack extension but no corresponding .idx file in the pack directory. Args: grace_period: Grace period in seconds for removing temporary files. If None, uses DEFAULT_TEMPFILE_GRACE_PERIOD. """ import glob if grace_period is None: grace_period = DEFAULT_TEMPFILE_GRACE_PERIOD # Clean up tmp_pack_* files in the repository directory for tmp_file in glob.glob(os.path.join(self.path, "tmp_pack_*")): # Check if file is old enough (more than grace period) mtime = os.path.getmtime(tmp_file) if time.time() - mtime > grace_period: os.remove(tmp_file) # Clean up orphaned .pack files without corresponding .idx files try: pack_dir_contents = os.listdir(self.pack_dir) except FileNotFoundError: return pack_files = {} idx_files = set() for name in pack_dir_contents: if name.endswith(".pack"): base_name = name[:-5] # Remove .pack extension pack_files[base_name] = name elif name.endswith(".idx"): base_name = name[:-4] # Remove .idx extension idx_files.add(base_name) # Remove .pack files without corresponding .idx files for base_name, pack_name in pack_files.items(): if base_name not in idx_files: pack_path = os.path.join(self.pack_dir, pack_name) # Check if file is old enough (more than grace period) mtime = os.path.getmtime(pack_path) if time.time() - mtime > grace_period: os.remove(pack_path) def close(self) -> None: """Close the object store and release resources. This method closes all cached pack files, MIDX, and frees associated resources. Can be called multiple times safely. 
""" # Close MIDX if it's loaded if self._midx is not None: self._midx.close() self._midx = None # Close alternates if self._alternates is not None: for alt in self._alternates: alt.close() self._alternates = None # Call parent class close to handle pack files super().close() class MemoryObjectStore(PackCapableObjectStore): """Object store that keeps all objects in memory.""" def __init__(self, *, object_format: "ObjectFormat | None" = None) -> None: """Initialize a MemoryObjectStore. Creates an empty in-memory object store. Args: object_format: Hash algorithm to use (defaults to SHA1) """ super().__init__(object_format=object_format) self._data: dict[ObjectID, ShaFile] = {} self.pack_compression_level = -1 def _to_hexsha(self, sha: ObjectID | RawObjectID) -> ObjectID: if len(sha) == self.object_format.hex_length: return cast(ObjectID, sha) elif len(sha) == self.object_format.oid_length: return sha_to_hex(cast(RawObjectID, sha)) else: raise ValueError(f"Invalid sha {sha!r}") def contains_loose(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is loose.""" return self._to_hexsha(sha) in self._data def contains_packed(self, sha: ObjectID | RawObjectID) -> bool: """Check if a particular object is present by SHA1 and is packed.""" return False def __iter__(self) -> Iterator[ObjectID]: """Iterate over the SHAs that are present in this store.""" return iter(self._data.keys()) @property def packs(self) -> list[Pack]: """List with pack objects.""" return [] def get_raw(self, name: RawObjectID | ObjectID) -> tuple[int, bytes]: """Obtain the raw text for an object. Args: name: sha for the object. Returns: tuple with numeric type and object contents. """ obj = self[self._to_hexsha(name)] return obj.type_num, obj.as_raw_string() def __getitem__(self, name: ObjectID | RawObjectID) -> ShaFile: """Retrieve an object by SHA. Args: name: SHA of the object (as hex string or bytes) Returns: Copy of the ShaFile object Raises: KeyError: If the object is not found """ return self._data[self._to_hexsha(name)].copy() def __delitem__(self, name: ObjectID) -> None: """Delete an object from this store, for testing only.""" del self._data[self._to_hexsha(name)] def add_object(self, obj: ShaFile) -> None: """Add a single object to this object store.""" self._data[obj.id] = obj.copy() def add_objects( self, objects: Iterable[tuple[ShaFile, str | None]], progress: Callable[[str], None] | None = None, ) -> None: """Add a set of objects to this object store. Args: objects: Iterable over a list of (object, path) tuples progress: Optional progress reporting function. """ for obj, path in objects: self.add_object(obj) def add_pack(self) -> tuple[BinaryIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store. Because this object store doesn't support packs, we extract and add the individual objects. Returns: Fileobject to write to and a commit function to call when the pack is finished. 
""" from tempfile import SpooledTemporaryFile f = SpooledTemporaryFile(max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-") def commit() -> None: size = f.tell() if size > 0: f.seek(0) p = PackData.from_file(f, self.object_format, size) for obj in PackInflater.for_pack_data(p, self.get_raw): # type: ignore[arg-type] self.add_object(obj) p.close() f.close() else: f.close() def abort() -> None: f.close() return f, commit, abort # type: ignore[return-value] def add_pack_data( self, count: int, unpacked_objects: Iterator[UnpackedObject], progress: Callable[[str], None] | None = None, ) -> None: """Add pack data to this object store. Args: count: Number of items to add unpacked_objects: Iterator of UnpackedObject instances progress: Optional progress reporting function. """ if count == 0: return # Since MemoryObjectStore doesn't support pack files, we need to # extract individual objects. To handle deltas properly, we write # to a temporary pack and then use PackInflater to resolve them. f, commit, abort = self.add_pack() try: write_pack_data( f.write, unpacked_objects, num_records=count, progress=progress, object_format=self.object_format, ) except BaseException: abort() raise else: commit() def add_thin_pack( self, read_all: Callable[[int], bytes], read_some: Callable[[int], bytes] | None, progress: Callable[[str], None] | None = None, ) -> None: """Add a new thin pack to this object store. Thin packs are packs that contain deltas with parents that exist outside the pack. Because this object store doesn't support packs, we extract and add the individual objects. Args: read_all: Read function that blocks until the number of requested bytes are read. read_some: Read function that returns at least one byte, but may not return the number of bytes requested. progress: Optional progress reporting function. """ f, commit, abort = self.add_pack() try: copier = PackStreamCopier( self.object_format.hash_func, read_all, read_some, f, ) copier.verify() except BaseException: abort() raise else: commit() class ObjectIterator(Protocol): """Interface for iterating over objects.""" def iterobjects(self) -> Iterator[ShaFile]: """Iterate over all objects. Returns: Iterator of ShaFile objects """ raise NotImplementedError(self.iterobjects) def tree_lookup_path( lookup_obj: Callable[[ObjectID | RawObjectID], ShaFile], root_sha: ObjectID | RawObjectID, path: bytes, ) -> tuple[int, ObjectID]: """Look up an object in a Git tree. Args: lookup_obj: Callback for retrieving object by SHA1 root_sha: SHA1 of the root tree path: Path to lookup Returns: A tuple of (mode, SHA) of the resulting path. """ tree = lookup_obj(root_sha) if not isinstance(tree, Tree): raise NotTreeError(root_sha) return tree.lookup_path(lookup_obj, path) def _collect_filetree_revs( obj_store: ObjectContainer, tree_sha: ObjectID, kset: set[ObjectID] ) -> None: """Collect SHA1s of files and directories for specified tree. 
Args: obj_store: Object store to get objects by SHA from tree_sha: tree reference to walk kset: set to fill with references to files and directories """ filetree = obj_store[tree_sha] assert isinstance(filetree, Tree) for name, mode, sha in filetree.iteritems(): assert mode is not None assert sha is not None if not S_ISGITLINK(mode) and sha not in kset: kset.add(sha) if stat.S_ISDIR(mode): _collect_filetree_revs(obj_store, sha, kset) def _split_commits_and_tags( obj_store: ObjectContainer, lst: Iterable[ObjectID], *, unknown: str = "error", ) -> tuple[set[ObjectID], set[ObjectID], set[ObjectID]]: """Split object id list into three lists with commit, tag, and other SHAs. Commits referenced by tags are included into commits list as well. Only SHA1s known in this repository will get through, controlled by the unknown parameter. Args: obj_store: Object store to get objects by SHA1 from lst: Collection of commit and tag SHAs unknown: How to handle unknown objects: "error", "warn", or "ignore" Returns: A tuple of (commits, tags, others) SHA1s """ import logging if unknown not in ("error", "warn", "ignore"): raise ValueError( f"unknown must be 'error', 'warn', or 'ignore', got {unknown!r}" ) commits: set[ObjectID] = set() tags: set[ObjectID] = set() others: set[ObjectID] = set() for e in lst: try: o = obj_store[e] except KeyError: if unknown == "error": raise elif unknown == "warn": logging.warning( "Object %s not found in object store", e.decode("ascii") ) # else: ignore else: if isinstance(o, Commit): commits.add(e) elif isinstance(o, Tag): tags.add(e) tagged = o.object[1] c, t, os = _split_commits_and_tags(obj_store, [tagged], unknown=unknown) commits |= c tags |= t others |= os else: others.add(e) return (commits, tags, others) class MissingObjectFinder: """Find the objects missing from another object store. Args: object_store: Object store containing at least all objects to be sent haves: SHA1s of commits not to send (already present in target) wants: SHA1s of commits to send progress: Optional function to report progress to. get_tagged: Function that returns a dict of pointed-to sha -> tag sha for including tags. get_parents: Optional function for getting the parents of a commit. """ def __init__( self, object_store: BaseObjectStore, haves: Iterable[ObjectID], wants: Iterable[ObjectID], *, shallow: Set[ObjectID] | None = None, progress: Callable[[bytes], None] | None = None, get_tagged: Callable[[], dict[ObjectID, ObjectID]] | None = None, get_parents: Callable[[Commit], list[ObjectID]] = lambda commit: commit.parents, ) -> None: """Initialize a MissingObjectFinder. Args: object_store: Object store containing objects haves: SHA1s of objects already present in target wants: SHA1s of objects to send shallow: Set of shallow commit SHA1s progress: Optional progress reporting callback get_tagged: Function returning dict of pointed-to sha -> tag sha get_parents: Function for getting commit parents """ self.object_store = object_store if shallow is None: shallow = set() self._get_parents = get_parents reachability = object_store.get_reachability_provider() # process Commits and Tags differently # haves may list commits/tags not available locally (silently ignore them). # wants should only contain valid SHAs (fail fast if not). 
have_commits, have_tags, have_others = _split_commits_and_tags( object_store, haves, unknown="ignore" ) want_commits, want_tags, want_others = _split_commits_and_tags( object_store, wants, unknown="error" ) # all_ancestors is a set of commits that shall not be sent # (complete repository up to 'haves') all_ancestors = reachability.get_reachable_commits( have_commits, exclude=None, shallow=shallow ) # all_missing - complete set of commits between haves and wants # common_commits - boundary commits directly encountered when traversing wants # We use _collect_ancestors here because we need the exact boundary behavior: # commits that are in all_ancestors and directly reachable from wants, # but we don't traverse past them. This is hard to express with the # reachability abstraction alone. missing_commits, common_commits = _collect_ancestors( object_store, want_commits, frozenset(all_ancestors), shallow=frozenset(shallow), get_parents=self._get_parents, ) self.remote_has: set[ObjectID] = set() # Now, fill sha_done with commits and revisions of # files and directories known to be both locally # and on target. Thus these commits and files # won't get selected for fetch for h in common_commits: self.remote_has.add(h) cmt = object_store[h] assert isinstance(cmt, Commit) # Get tree objects for this commit tree_objects = reachability.get_tree_objects([cmt.tree]) self.remote_has.update(tree_objects) # record tags we have as visited, too for t in have_tags: self.remote_has.add(t) self.sha_done = set(self.remote_has) # in fact, what we 'want' is commits, tags, and others # we've found missing self.objects_to_send: set[tuple[ObjectID, bytes | None, int | None, bool]] = { (w, None, Commit.type_num, False) for w in missing_commits } missing_tags = want_tags.difference(have_tags) self.objects_to_send.update( {(w, None, Tag.type_num, False) for w in missing_tags} ) missing_others = want_others.difference(have_others) self.objects_to_send.update({(w, None, None, False) for w in missing_others}) if progress is None: self.progress: Callable[[bytes], None] = lambda x: None else: self.progress = progress self._tagged = (get_tagged and get_tagged()) or {} def get_remote_has(self) -> set[ObjectID]: """Get the set of SHAs the remote has. Returns: Set of SHA1s that the remote side already has """ return self.remote_has def add_todo( self, entries: Iterable[tuple[ObjectID, bytes | None, int | None, bool]] ) -> None: """Add objects to the todo list. Args: entries: Iterable of tuples (sha, name, type_num, is_leaf) """ self.objects_to_send.update([e for e in entries if e[0] not in self.sha_done]) def __next__(self) -> tuple[ObjectID, PackHint | None]: """Get the next object to send. 
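        For example (illustrative), ``list(finder)`` drains an assumed
        ``MissingObjectFinder`` instance ``finder`` and yields
        ``(sha, pack_hint)`` pairs suitable for feeding to a pack writer.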
Returns: Tuple of (sha, pack_hint) Raises: StopIteration: When no more objects to send """ while True: if not self.objects_to_send: self.progress( f"counting objects: {len(self.sha_done)}, done.\n".encode("ascii") ) raise StopIteration (sha, name, type_num, leaf) = self.objects_to_send.pop() if sha not in self.sha_done: break if not leaf: o = self.object_store[sha] if isinstance(o, Commit): self.add_todo([(o.tree, b"", Tree.type_num, False)]) elif isinstance(o, Tree): todos = [] for n, m, s in o.iteritems(): assert m is not None assert n is not None assert s is not None if not S_ISGITLINK(m): todos.append( ( s, n, (Blob.type_num if stat.S_ISREG(m) else Tree.type_num), not stat.S_ISDIR(m), ) ) self.add_todo(todos) elif isinstance(o, Tag): self.add_todo([(o.object[1], None, o.object[0].type_num, False)]) if sha in self._tagged: self.add_todo([(self._tagged[sha], None, None, True)]) self.sha_done.add(sha) if len(self.sha_done) % 1000 == 0: self.progress(f"counting objects: {len(self.sha_done)}\r".encode("ascii")) if type_num is None: pack_hint = None else: pack_hint = (type_num, name) return (sha, pack_hint) def __iter__(self) -> Iterator[tuple[ObjectID, PackHint | None]]: """Return iterator over objects to send. Returns: Self (this class implements the iterator protocol) """ return self class ObjectStoreGraphWalker: """Graph walker that finds what commits are missing from an object store.""" heads: set[ObjectID] """Revisions without descendants in the local repo.""" get_parents: Callable[[ObjectID], list[ObjectID]] """Function to retrieve parents in the local repo.""" shallow: set[ObjectID] def __init__( self, local_heads: Iterable[ObjectID], get_parents: Callable[[ObjectID], list[ObjectID]], shallow: set[ObjectID] | None = None, update_shallow: Callable[[set[ObjectID] | None, set[ObjectID] | None], None] | None = None, ) -> None: """Create a new instance. Args: local_heads: Heads to start search with get_parents: Function for finding the parents of a SHA1. shallow: Set of shallow commits. update_shallow: Function to update shallow commits. """ self.heads = set(local_heads) self.get_parents = get_parents self.parents: dict[ObjectID, list[ObjectID] | None] = {} if shallow is None: shallow = set() self.shallow = shallow self.update_shallow = update_shallow def nak(self) -> None: """Nothing in common was found.""" def ack(self, sha: ObjectID) -> None: """Ack that a revision and its ancestors are present in the source.""" if len(sha) != 40: # TODO: support SHA256 raise ValueError(f"unexpected sha {sha!r} received") ancestors = {sha} # stop if we run out of heads to remove while self.heads: for a in ancestors: if a in self.heads: self.heads.remove(a) # collect all ancestors new_ancestors = set() for a in ancestors: ps = self.parents.get(a) if ps is not None: new_ancestors.update(ps) self.parents[a] = None # no more ancestors; stop if not new_ancestors: break ancestors = new_ancestors def next(self) -> ObjectID | None: """Iterate over ancestors of heads in the target.""" if self.heads: ret = self.heads.pop() try: ps = self.get_parents(ret) except KeyError: return None self.parents[ret] = ps self.heads.update([p for p in ps if p not in self.parents]) return ret return None __next__ = next def commit_tree_changes( object_store: BaseObjectStore, tree: ObjectID | Tree, changes: Sequence[tuple[bytes, int | None, ObjectID | None]], ) -> ObjectID: """Commit a specified set of changes to a tree structure. This will apply a set of changes on top of an existing tree, storing new objects in object_store. 
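    A minimal illustrative sketch (``store``, ``old_root`` and ``blob_id`` are
    assumed to exist):

        new_root = commit_tree_changes(store, old_root, [
            (b"docs/README.md", 0o100644, blob_id),  # add or replace a file
            (b"obsolete.txt", None, None),           # delete a path
        ])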
changes are a list of tuples with (path, mode, object_sha). Paths can be both blobs and trees. See the mode and object sha to None deletes the path. This method works especially well if there are only a small number of changes to a big tree. For a large number of changes to a large tree, use e.g. commit_tree. Args: object_store: Object store to store new objects in and retrieve old ones from. tree: Original tree root (SHA or Tree object) changes: changes to apply Returns: New tree root object """ # TODO(jelmer): Save up the objects and add them using .add_objects # rather than with individual calls to .add_object. # Handle both Tree object and SHA if isinstance(tree, Tree): tree_obj: Tree = tree else: sha_obj = object_store[tree] assert isinstance(sha_obj, Tree) tree_obj = sha_obj nested_changes: dict[bytes, list[tuple[bytes, int | None, ObjectID | None]]] = {} for path, new_mode, new_sha in changes: try: (dirname, subpath) = path.split(b"/", 1) except ValueError: if new_sha is None: del tree_obj[path] else: assert new_mode is not None tree_obj[path] = (new_mode, new_sha) else: nested_changes.setdefault(dirname, []).append((subpath, new_mode, new_sha)) for name, subchanges in nested_changes.items(): try: orig_subtree_id: ObjectID | Tree = tree_obj[name][1] except KeyError: # For new directories, pass an empty Tree object orig_subtree_id = Tree() subtree_id = commit_tree_changes(object_store, orig_subtree_id, subchanges) subtree = object_store[subtree_id] assert isinstance(subtree, Tree) if len(subtree) == 0: del tree_obj[name] else: tree_obj[name] = (stat.S_IFDIR, subtree.id) object_store.add_object(tree_obj) return tree_obj.id class OverlayObjectStore(BaseObjectStore): """Object store that can overlay multiple object stores.""" def __init__( self, bases: list[BaseObjectStore], add_store: BaseObjectStore | None = None, ) -> None: """Initialize an OverlayObjectStore. Args: bases: List of base object stores to overlay add_store: Optional store to write new objects to Raises: ValueError: If stores have different hash algorithms """ from .object_format import verify_same_object_format # Verify all stores use the same hash algorithm store_algorithms = [store.object_format for store in bases] if add_store: store_algorithms.append(add_store.object_format) object_format = verify_same_object_format(*store_algorithms) super().__init__(object_format=object_format) self.bases = bases self.add_store = add_store def add_object(self, object: ShaFile) -> None: """Add a single object to the store. Args: object: Object to add Raises: NotImplementedError: If no add_store was provided """ if self.add_store is None: raise NotImplementedError(self.add_object) return self.add_store.add_object(object) def add_objects( self, objects: Sequence[tuple[ShaFile, str | None]], progress: Callable[[str], None] | None = None, ) -> Pack | None: """Add multiple objects to the store. Args: objects: Iterator of objects to add progress: Optional progress reporting callback Raises: NotImplementedError: If no add_store was provided """ if self.add_store is None: raise NotImplementedError(self.add_object) return self.add_store.add_objects(objects, progress) @property def packs(self) -> list[Pack]: """Get the list of packs from all overlaid stores. Returns: Combined list of packs from all base stores """ ret = [] for b in self.bases: ret.extend(b.packs) return ret def __iter__(self) -> Iterator[ObjectID]: """Iterate over all object SHAs in the overlaid stores. 
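        For example (illustrative), ``set(OverlayObjectStore([store_a, store_b]))``
        contains each object id exactly once, even when both assumed stores
        hold the same object.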
Returns: Iterator of object SHAs (deduped across stores) """ done = set() for b in self.bases: for o_id in b: if o_id not in done: yield o_id done.add(o_id) def iterobjects_subset( self, shas: Iterable[ObjectID], *, allow_missing: bool = False ) -> Iterator[ShaFile]: """Iterate over a subset of objects from the overlaid stores. Args: shas: Iterable of object SHAs to retrieve allow_missing: If True, skip missing objects; if False, raise KeyError Returns: Iterator of ShaFile objects Raises: KeyError: If an object is missing and allow_missing is False """ todo = set(shas) found: set[ObjectID] = set() for b in self.bases: # Create a copy of todo for each base to avoid modifying # the set while iterating through it current_todo = todo - found for o in b.iterobjects_subset(current_todo, allow_missing=True): yield o found.add(o.id) # Check for any remaining objects not found missing = todo - found if missing and not allow_missing: raise KeyError(next(iter(missing))) def iter_unpacked_subset( self, shas: Iterable[ObjectID | RawObjectID], include_comp: bool = False, allow_missing: bool = False, convert_ofs_delta: bool = True, ) -> Iterator[UnpackedObject]: """Iterate over unpacked objects from the overlaid stores. Args: shas: Iterable of object SHAs to retrieve include_comp: Whether to include compressed data allow_missing: If True, skip missing objects; if False, raise KeyError convert_ofs_delta: Whether to convert OFS_DELTA objects Returns: Iterator of unpacked objects Raises: KeyError: If an object is missing and allow_missing is False """ todo: set[ObjectID | RawObjectID] = set(shas) for b in self.bases: for o in b.iter_unpacked_subset( todo, include_comp=include_comp, allow_missing=True, convert_ofs_delta=convert_ofs_delta, ): yield o todo.remove(o.sha()) if todo and not allow_missing: raise KeyError(next(iter(todo))) def get_raw(self, sha_id: ObjectID | RawObjectID) -> tuple[int, bytes]: """Get the raw object data from the overlaid stores. Args: sha_id: SHA of the object Returns: Tuple of (type_num, raw_data) Raises: KeyError: If object not found in any base store """ for b in self.bases: try: return b.get_raw(sha_id) except KeyError: pass raise KeyError(sha_id) def contains_packed(self, sha: ObjectID | RawObjectID) -> bool: """Check if an object is packed in any base store. Args: sha: SHA of the object Returns: True if object is packed in any base store """ for b in self.bases: if b.contains_packed(sha): return True return False def contains_loose(self, sha: ObjectID | RawObjectID) -> bool: """Check if an object is loose in any base store. Args: sha: SHA of the object Returns: True if object is loose in any base store """ for b in self.bases: if b.contains_loose(sha): return True return False def read_packs_file(f: BinaryIO) -> Iterator[str]: """Yield the packs listed in a packs file.""" for line in f.read().splitlines(): if not line: continue (kind, name) = line.split(b" ", 1) if kind != b"P": continue yield os.fsdecode(name) class BucketBasedObjectStore(PackBasedObjectStore): """Object store implementation that uses a bucket store like S3 as backend.""" def _iter_loose_objects(self) -> Iterator[ObjectID]: """Iterate over the SHAs of all loose objects.""" return iter([]) def _get_loose_object(self, sha: ObjectID | RawObjectID) -> None: return None def delete_loose_object(self, sha: ObjectID) -> None: """Delete a loose object (no-op for bucket stores). Bucket-based stores don't have loose objects, so this is a no-op. Args: sha: SHA of the object to delete """ # Doesn't exist.. 
def pack_loose_objects(self, progress: Callable[[str], None] | None = None) -> int: """Pack loose objects. Returns number of objects packed. BucketBasedObjectStore doesn't support loose objects, so this is a no-op. Args: progress: Optional progress reporting callback (ignored) """ return 0 def _remove_pack_by_name(self, name: str) -> None: """Remove a pack by name. Subclasses should implement this.""" raise NotImplementedError(self._remove_pack_by_name) def _iter_pack_names(self) -> Iterator[str]: raise NotImplementedError(self._iter_pack_names) def _get_pack(self, name: str) -> Pack: raise NotImplementedError(self._get_pack) def _update_pack_cache(self) -> list[Pack]: pack_files = set(self._iter_pack_names()) # Open newly appeared pack files new_packs = [] for f in pack_files: if f not in self._pack_cache: pack = self._get_pack(f) new_packs.append(pack) self._pack_cache[f] = pack # Remove disappeared pack files for f in set(self._pack_cache) - pack_files: self._pack_cache.pop(f).close() return new_packs def _upload_pack( self, basename: str, pack_file: BinaryIO, index_file: BinaryIO ) -> None: raise NotImplementedError def add_pack(self) -> tuple[BinaryIO, Callable[[], None], Callable[[], None]]: """Add a new pack to this object store. Returns: Fileobject to write to, a commit function to call when the pack is finished and an abort function. """ import tempfile pf = tempfile.SpooledTemporaryFile( max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-" ) def commit() -> Pack | None: if pf.tell() == 0: pf.close() return None pf.seek(0) p = PackData(pf.name, file=pf, object_format=self.object_format) entries = p.sorted_entries() basename = iter_sha1(entry[0] for entry in entries).decode("ascii") idxf = tempfile.SpooledTemporaryFile( max_size=PACK_SPOOL_FILE_MAX_SIZE, prefix="incoming-" ) checksum = p.get_stored_checksum() write_pack_index(idxf, entries, checksum, version=self.pack_index_version) idxf.seek(0) idx = load_pack_index_file(basename + ".idx", idxf, self.object_format) for pack in self.packs: if pack.get_stored_checksum() == p.get_stored_checksum(): p.close() idx.close() pf.close() idxf.close() return pack pf.seek(0) idxf.seek(0) self._upload_pack(basename, pf, idxf) # type: ignore[arg-type] final_pack = Pack.from_objects(p, idx) self._add_cached_pack(basename, final_pack) pf.close() idxf.close() return final_pack return pf, commit, pf.close # type: ignore[return-value] def _collect_ancestors( store: ObjectContainer, heads: Iterable[ObjectID], common: frozenset[ObjectID] = frozenset(), shallow: frozenset[ObjectID] = frozenset(), get_parents: Callable[[Commit], list[ObjectID]] = lambda commit: commit.parents, ) -> tuple[set[ObjectID], set[ObjectID]]: """Collect all ancestors of heads up to (excluding) those in common. Args: store: Object store to get commits from heads: commits to start from common: commits to end at, or empty set to walk repository completely shallow: Set of shallow commits get_parents: Optional function for getting the parents of a commit. 
Returns: a tuple (A, B) where A - all commits reachable from heads but not present in common, B - common (shared) elements that are directly reachable from heads """ bases = set() commits = set() queue: list[ObjectID] = [] queue.extend(heads) # Try to use commit graph if available commit_graph = store.get_commit_graph() while queue: e = queue.pop(0) if e in common: bases.add(e) elif e not in commits: commits.add(e) if e in shallow: continue # Try to use commit graph for parent lookup parents = None if commit_graph: parents = commit_graph.get_parents(e) if parents is None: # Fall back to loading the object cmt = store[e] assert isinstance(cmt, Commit) parents = get_parents(cmt) queue.extend(parents) return (commits, bases) def iter_tree_contents( store: ObjectContainer, tree_id: ObjectID | None, *, include_trees: bool = False ) -> Iterator[TreeEntry]: """Iterate the contents of a tree and all subtrees. Iteration is depth-first pre-order, as in e.g. os.walk. Args: store: Object store to get trees from tree_id: SHA1 of the tree. include_trees: If True, include tree objects in the iteration. Yields: TreeEntry namedtuples for all the objects in a tree. """ if tree_id is None: return # This could be fairly easily generalized to >2 trees if we find a use # case. todo = [TreeEntry(b"", stat.S_IFDIR, tree_id)] while todo: entry = todo.pop() assert entry.mode is not None if stat.S_ISDIR(entry.mode): extra = [] assert entry.sha is not None tree = store[entry.sha] assert isinstance(tree, Tree) for subentry in tree.iteritems(name_order=True): assert entry.path is not None extra.append(subentry.in_path(entry.path)) todo.extend(reversed(extra)) if not stat.S_ISDIR(entry.mode) or include_trees: yield entry def iter_commit_contents( store: ObjectContainer, commit: Commit | ObjectID | RawObjectID, *, include: Sequence[str | bytes | Path] | None = None, ) -> Iterator[TreeEntry]: """Iterate the contents of the repository at the specified commit. This is a wrapper around iter_tree_contents() and tree_lookup_path() to simplify the common task of getting the contest of a repo at a particular commit. See also dulwich.index.build_file_from_blob() for writing individual files to disk. Args: store: Object store to get trees from commit: Commit object, or SHA1 of a commit include: if provided, only the entries whose paths are in the list, or whose parent tree is in the list, will be included. Note that duplicate or overlapping paths (e.g. ["foo", "foo/bar"]) may result in duplicate entries Yields: TreeEntry namedtuples for all matching files in a commit. """ sha = commit.id if isinstance(commit, Commit) else commit if not isinstance(obj := store[sha], Commit): raise TypeError( f"{sha.decode('ascii')} should be ID of a Commit, but is {type(obj)}" ) commit = obj encoding = commit.encoding or "utf-8" include_bytes: list[bytes] = ( [ path if isinstance(path, bytes) else str(path).encode(encoding) for path in include ] if include is not None else [b""] ) for path in include_bytes: mode, obj_id = tree_lookup_path(store.__getitem__, commit.tree, path) # Iterate all contained files if path points to a dir, otherwise just get that # single file if isinstance(store[obj_id], Tree): for entry in iter_tree_contents(store, obj_id): yield entry.in_path(path) else: yield TreeEntry(path, mode, obj_id) def peel_sha( store: ObjectContainer, sha: ObjectID | RawObjectID ) -> tuple[ShaFile, ShaFile]: """Peel all tags from a SHA. Args: store: Object store to get objects from sha: The object SHA to peel. 
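    For example (illustrative), ``unpeeled, peeled = peel_sha(store, ref_sha)``
    returns the object that ``ref_sha`` names and the non-tag object it
    ultimately points to (``store`` and ``ref_sha`` are assumed).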
Returns: The fully-peeled SHA1 of a tag object, after peeling all intermediate tags; if the original ref does not point to a tag, this will equal the original SHA1. """ unpeeled = obj = store[sha] obj_class = object_class(obj.type_name) while obj_class is Tag: assert isinstance(obj, Tag) obj_class, sha = obj.object obj = store[sha] return unpeeled, obj class GraphTraversalReachability: """Naive graph traversal implementation of ObjectReachabilityProvider. This implementation wraps existing graph traversal functions (_collect_ancestors, _collect_filetree_revs) to provide the standard reachability interface without any performance optimizations. """ def __init__(self, object_store: BaseObjectStore) -> None: """Initialize the graph traversal provider. Args: object_store: Object store to query """ self.store = object_store def get_reachable_commits( self, heads: Iterable[ObjectID], exclude: Iterable[ObjectID] | None = None, shallow: Set[ObjectID] | None = None, ) -> set[ObjectID]: """Get all commits reachable from heads, excluding those in exclude. Uses _collect_ancestors for commit traversal. Args: heads: Starting commit SHAs exclude: Commit SHAs to exclude (and their ancestors) shallow: Set of shallow commit boundaries Returns: Set of commit SHAs reachable from heads but not from exclude """ exclude_set = frozenset(exclude) if exclude else frozenset() shallow_set = frozenset(shallow) if shallow else frozenset() commits, _bases = _collect_ancestors( self.store, heads, exclude_set, shallow_set ) return commits def get_tree_objects( self, tree_shas: Iterable[ObjectID], ) -> set[ObjectID]: """Get all trees and blobs reachable from the given trees. Uses _collect_filetree_revs for tree traversal. Args: tree_shas: Starting tree SHAs Returns: Set of tree and blob SHAs """ result: set[ObjectID] = set() for tree_sha in tree_shas: _collect_filetree_revs(self.store, tree_sha, result) return result def get_reachable_objects( self, commits: Iterable[ObjectID], exclude_commits: Iterable[ObjectID] | None = None, ) -> set[ObjectID]: """Get all objects (commits + trees + blobs) reachable from commits. Args: commits: Starting commit SHAs exclude_commits: Commits whose objects should be excluded Returns: Set of all object SHAs (commits, trees, blobs) """ commits_set = set(commits) result = set(commits_set) # Get trees for all commits tree_shas = [] for commit_sha in commits_set: try: commit = self.store[commit_sha] if isinstance(commit, Commit): tree_shas.append(commit.tree) except KeyError: # Commit not in store, skip continue # Collect all tree/blob objects result.update(self.get_tree_objects(tree_shas)) # Exclude objects from exclude_commits if needed if exclude_commits: exclude_objects = self.get_reachable_objects(exclude_commits, None) result -= exclude_objects return result class BitmapReachability: """Bitmap-accelerated implementation of ObjectReachabilityProvider. This implementation uses packfile bitmap indexes where available to accelerate reachability queries. Falls back to graph traversal when bitmaps don't cover the requested commits. """ def __init__(self, object_store: "PackBasedObjectStore") -> None: """Initialize the bitmap provider. 
Args: object_store: Pack-based object store with bitmap support """ self.store = object_store # Fallback to graph traversal for operations not yet optimized self._fallback = GraphTraversalReachability(object_store) def _combine_commit_bitmaps( self, commit_shas: set[ObjectID], exclude_shas: set[ObjectID] | None = None, ) -> tuple["EWAHBitmap", "Pack"] | None: """Combine bitmaps for multiple commits using OR, with optional exclusion. Args: commit_shas: Set of commit SHAs to combine exclude_shas: Optional set of commit SHAs to exclude Returns: Tuple of (combined_bitmap, pack) or None if bitmaps unavailable """ from .bitmap import find_commit_bitmaps # Find bitmaps for the commits commit_bitmaps = find_commit_bitmaps(commit_shas, self.store.packs) # If we can't find bitmaps for all commits, return None if len(commit_bitmaps) < len(commit_shas): return None # Combine bitmaps using OR combined_bitmap = None result_pack = None for commit_sha in commit_shas: pack, pack_bitmap, _sha_to_pos = commit_bitmaps[commit_sha] commit_bitmap = pack_bitmap.get_bitmap(commit_sha) if commit_bitmap is None: return None if combined_bitmap is None: combined_bitmap = commit_bitmap result_pack = pack elif pack == result_pack: # Same pack, can OR directly combined_bitmap = combined_bitmap | commit_bitmap else: # Different packs, can't combine return None # Handle exclusions if provided if exclude_shas and result_pack and combined_bitmap: exclude_bitmaps = find_commit_bitmaps(exclude_shas, [result_pack]) if len(exclude_bitmaps) == len(exclude_shas): # All excludes have bitmaps, compute exclusion exclude_combined = None for commit_sha in exclude_shas: _pack, pack_bitmap, _sha_to_pos = exclude_bitmaps[commit_sha] exclude_bitmap = pack_bitmap.get_bitmap(commit_sha) if exclude_bitmap is None: break if exclude_combined is None: exclude_combined = exclude_bitmap else: exclude_combined = exclude_combined | exclude_bitmap # Subtract excludes using set difference if exclude_combined: combined_bitmap = combined_bitmap - exclude_combined if combined_bitmap and result_pack: return (combined_bitmap, result_pack) return None def get_reachable_commits( self, heads: Iterable[ObjectID], exclude: Iterable[ObjectID] | None = None, shallow: Set[ObjectID] | None = None, ) -> set[ObjectID]: """Get all commits reachable from heads using bitmaps where possible. Args: heads: Starting commit SHAs exclude: Commit SHAs to exclude (and their ancestors) shallow: Set of shallow commit boundaries Returns: Set of commit SHAs reachable from heads but not from exclude """ from .bitmap import bitmap_to_object_shas # If shallow is specified, fall back to graph traversal # (bitmaps don't support shallow boundaries well) if shallow: return self._fallback.get_reachable_commits(heads, exclude, shallow) heads_set = set(heads) exclude_set = set(exclude) if exclude else None # Try to combine bitmaps result = self._combine_commit_bitmaps(heads_set, exclude_set) if result is None: return self._fallback.get_reachable_commits(heads, exclude, shallow) combined_bitmap, result_pack = result # Convert bitmap to commit SHAs, filtering for commits only pack_bitmap = result_pack.bitmap if pack_bitmap is None: return self._fallback.get_reachable_commits(heads, exclude, shallow) commit_type_filter = pack_bitmap.commit_bitmap return bitmap_to_object_shas( combined_bitmap, result_pack.index, commit_type_filter ) def get_tree_objects( self, tree_shas: Iterable[ObjectID], ) -> set[ObjectID]: """Get all trees and blobs reachable from the given trees. 
Args: tree_shas: Starting tree SHAs Returns: Set of tree and blob SHAs """ # Tree traversal doesn't benefit much from bitmaps, use fallback return self._fallback.get_tree_objects(tree_shas) def get_reachable_objects( self, commits: Iterable[ObjectID], exclude_commits: Iterable[ObjectID] | None = None, ) -> set[ObjectID]: """Get all objects reachable from commits using bitmaps. Args: commits: Starting commit SHAs exclude_commits: Commits whose objects should be excluded Returns: Set of all object SHAs (commits, trees, blobs) """ from .bitmap import bitmap_to_object_shas commits_set = set(commits) exclude_set = set(exclude_commits) if exclude_commits else None # Try to combine bitmaps result = self._combine_commit_bitmaps(commits_set, exclude_set) if result is None: return self._fallback.get_reachable_objects(commits, exclude_commits) combined_bitmap, result_pack = result # Convert bitmap to all object SHAs (no type filter) return bitmap_to_object_shas(combined_bitmap, result_pack.index, None) dulwich-1.0.0/dulwich/objects.py000066400000000000000000002240621513301442600166160ustar00rootroot00000000000000# objects.py -- Access to base git objects # Copyright (C) 2007 James Westby # Copyright (C) 2008-2013 Jelmer Vernooij # # SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later # Dulwich is dual-licensed under the Apache License, Version 2.0 and the GNU # General Public License as published by the Free Software Foundation; version 2.0 # or (at your option) any later version. You can redistribute it and/or # modify it under the terms of either of these two licenses. # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # You should have received a copy of the licenses; if not, see # for a copy of the GNU General Public License # and for a copy of the Apache # License, Version 2.0. 
# """Access to base git objects.""" __all__ = [ "BEGIN_PGP_SIGNATURE", "BEGIN_SSH_SIGNATURE", "MAX_TIME", "OBJECT_CLASSES", "SIGNATURE_PGP", "SIGNATURE_SSH", "S_IFGITLINK", "S_ISGITLINK", "ZERO_SHA", "Blob", "Commit", "EmptyFileException", "FixedSha", "ObjectID", "RawObjectID", "ShaFile", "SubmoduleEncountered", "Tag", "Tree", "TreeEntry", "check_hexsha", "check_identity", "check_time", "filename_to_hex", "format_time_entry", "format_timezone", "git_line", "hex_to_filename", "hex_to_sha", "is_blob", "is_commit", "is_tag", "is_tree", "key_entry", "key_entry_name_order", "object_class", "object_header", "parse_commit_broken", "parse_tree", "pretty_format_tree_entry", "serializable_property", "serialize_tree", "sha_to_hex", "sorted_tree_items", "valid_hexsha", ] import binascii import os import posixpath import re import stat import sys import zlib from collections.abc import Callable, Iterable, Iterator, Sequence from hashlib import sha1 from io import BufferedIOBase, BytesIO from typing import ( IO, TYPE_CHECKING, NamedTuple, TypeVar, ) if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self from typing import NewType, TypeGuard from .errors import ( ChecksumMismatch, FileFormatException, NotBlobError, NotCommitError, NotTagError, NotTreeError, ObjectFormatException, ) from .file import GitFile from .object_format import DEFAULT_OBJECT_FORMAT, ObjectFormat if TYPE_CHECKING: from _hashlib import HASH from .file import _GitFile # Zero SHA constants for backward compatibility - now defined below as ObjectID # Header fields for commits _TREE_HEADER = b"tree" _PARENT_HEADER = b"parent" _AUTHOR_HEADER = b"author" _COMMITTER_HEADER = b"committer" _ENCODING_HEADER = b"encoding" _MERGETAG_HEADER = b"mergetag" _GPGSIG_HEADER = b"gpgsig" # Header fields for objects _OBJECT_HEADER = b"object" _TYPE_HEADER = b"type" _TAG_HEADER = b"tag" _TAGGER_HEADER = b"tagger" S_IFGITLINK = 0o160000 # Intentionally flexible regex to support various types of brokenness # in commit/tag author/committer/tagger lines _TIME_ENTRY_RE = re.compile( b"^(?P.*) (?P